Fix generator quality issues and run initial corpus pipeline

Pre-corpus fixes (from EVALUATION.md):
- Clean 2,264 contaminated rows from augmented relations (bridge
  artifacts, full-sentence HasProperty values, null bytes, empty words)
- Fix article logic: dynamic a/an across Deconstruction, FalseEquivalence,
  DenialOfConsequences, TautologicalWisdom templates
- Tighten _short_concepts() default from max_words=3 to 2
- Fix FutilePreparation gerund formation: filter vocab nouns and
  noun-suffix words out of UsedFor targets; fix CVC consonant doubling
  for 'y'-ending words
- Add _looks_like_verb() heuristic; improve _a() for vowel-sound edge cases

Pipeline hardening:
- polish_corpus.py: context-size fallback (truncate chain, then minimal
  prompt), classified error types, consecutive-error circuit breaker,
  10-entry flush granularity, ETA tracking, KeyboardInterrupt handling
- generate_raw_batch.sh: fix python -> python3
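The consecutive-error circuit breaker can be very small; a hypothetical sketch of the shape (not the exact polish_corpus.py implementation):

```python
class ConsecutiveErrorBreaker:
    """Trip after `limit` failures in a row; any success resets the streak.

    Sketch of a circuit breaker so one bad model state cannot silently
    burn through the rest of the corpus.
    """

    def __init__(self, limit: int = 5):
        self.limit = limit
        self.streak = 0

    def record(self, ok: bool) -> None:
        # Success resets the streak; failure extends it.
        self.streak = 0 if ok else self.streak + 1
        if self.streak >= self.limit:
            raise RuntimeError(
                f"{self.limit} consecutive polish errors; aborting run")
```

Paired with the 10-entry flush granularity, tripping the breaker loses at most a handful of in-flight entries.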

Corpus generation run (9,835 raw -> 5,499 polished -> 2,312 filtered):
- 44.1% discard rate, 0 errors, 82 minutes on an RTX 4090
- 9,257 training pairs across 5 input framing types
- 97.6% vocab coverage (609/624 words)
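The headline numbers are internally consistent; a quick arithmetic check using the values from this run:

```python
# Stage counts reported above.
raw, polished, filtered = 9835, 5499, 2312

polish_discards = raw - polished           # entries dropped during polish
assert polish_discards == 4336
polish_rate = polish_discards / raw
assert f"{polish_rate:.1%}" == "44.1%"     # the reported discard rate

filter_discards = polished - filtered      # further drops during filtering
assert filter_discards == 3187

coverage = 609 / 624                       # unique slot words / vocab size
assert f"{coverage:.1%}" == "97.6%"        # the reported vocab coverage
```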

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit 651ec3ffc6
Author: John McCardle
Date:   2026-03-10 04:33:56 -04:00
10 changed files with 34853 additions and 2406 deletions

corpus/corpus_filtered.jsonl   new file, 2312 lines (diff suppressed: too large)
corpus/corpus_polished.jsonl   new file, 9835 lines (diff suppressed: too large)
corpus/corpus_raw.jsonl        new file, 9835 lines (diff suppressed: too large)
corpus/corpus_stats.json       new file, 91 lines

@@ -0,0 +1,91 @@
{
"raw_count": 9835,
"raw_by_template": {
"deconstruction": 1500,
"denial_of_consequences": 1500,
"false_equivalence": 1500,
"futile_preparation": 1500,
"hypocritical_complaint": 1500,
"ironic_deficiency": 1500,
"tautological_wisdom": 835
},
"polished_count": 5499,
"discarded_during_polish": 4336,
"errors_during_polish": 0,
"polish_discard_rate": "44.1%",
"polished_by_template": {
"deconstruction": 1105,
"denial_of_consequences": 733,
"false_equivalence": 590,
"futile_preparation": 882,
"hypocritical_complaint": 573,
"ironic_deficiency": 831,
"tautological_wisdom": 785
},
"discarded_by_template": {
"deconstruction": 395,
"denial_of_consequences": 767,
"false_equivalence": 910,
"futile_preparation": 618,
"hypocritical_complaint": 927,
"ironic_deficiency": 669,
"tautological_wisdom": 50
},
"filtered_count": 2312,
"filtered_by_template": {
"deconstruction": 619,
"denial_of_consequences": 159,
"false_equivalence": 517,
"futile_preparation": 284,
"hypocritical_complaint": 168,
"ironic_deficiency": 358,
"tautological_wisdom": 207
},
"discarded_during_filter": 3187,
"training_pair_count": 9257,
"training_by_template": {
"deconstruction": 2488,
"denial_of_consequences": 630,
"false_equivalence": 2059,
"futile_preparation": 1146,
"hypocritical_complaint": 681,
"ironic_deficiency": 1429,
"tautological_wisdom": 824
},
"training_by_input_type": {
"category_seeded": 2312,
"open_ended": 562,
"persona_seeded": 2312,
"template_seeded": 1759,
"word_seeded": 2312
},
"unique_slot_words_used": 609,
"total_vocab_words": 624,
"vocab_coverage": "97.6%",
"words_never_used": [
"agate",
"alabaster",
"anise",
"azalea",
"bee",
"blowfish",
"cattail",
"cypress",
"emerald",
"gem",
"grebe",
"juniper",
"lyre",
"spear",
"theater"
],
"words_never_used_count": 15,
"avg_saying_length_words": 13.1,
"min_saying_length_words": 6,
"max_saying_length_words": 23,
"balance_warnings": [
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
]
}
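The balance_warnings entries above follow from a simple share-of-corpus threshold over filtered_by_template; a reconstruction of the check (assumed shape, not the repo's exact code):

```python
def balance_warnings(by_template: dict, threshold: float = 0.10) -> list:
    """Flag template families whose share of the filtered corpus
    falls below `threshold`."""
    total = sum(by_template.values())
    out = []
    for name, n in sorted(by_template.items()):
        share = n / total
        if share < threshold:
            out.append(
                f"WARNING: {name} has only {n} entries ({share:.1%}) "
                f"-- below {threshold:.0%} threshold. "
                f"Generate more raw sayings for this family.")
    return out
```

Run against the filtered_by_template counts, this reproduces the three warnings above (6.9%, 7.3%, 9.0%).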

corpus/discard_analysis.csv    new file, 3188 lines (diff suppressed: too large)
corpus/training_pairs.jsonl    new file, 9257 lines (diff suppressed: too large)