Fix generator quality issues and run initial corpus pipeline
Pre-corpus fixes (from EVALUATION.md):
- Clean 2,264 contaminated rows from augmented relations (bridge artifacts, full-sentence HasProperty values, null bytes, empty words)
- Fix article logic: dynamic a/an across Deconstruction, FalseEquivalence, DenialOfConsequences, TautologicalWisdom templates
- Tighten _short_concepts() default from max_words=3 to 2
- Fix FutilePreparation gerunding: filter vocab nouns and noun-suffix words from UsedFor targets; fix CVC doubling for 'y'-ending words
- Add _looks_like_verb() heuristic, improve _a() for vowel-sound edges

Pipeline hardening:
- polish_corpus.py: context-size fallback (truncate chain, then minimal prompt), classified error types, consecutive-error circuit breaker, 10-entry flush granularity, ETA tracking, KeyboardInterrupt handling
- generate_raw_batch.sh: fix python -> python3

Corpus generation run (9,835 raw -> 5,499 polished -> 2,312 filtered):
- 44.1% polish-stage discard rate, 0 errors, 82 minutes on RTX 4090
- 9,257 training pairs across 5 input framing types
- 97.6% vocab coverage (609/624 words)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
356b62c6ea
commit
651ec3ffc6
10 changed files with 34853 additions and 2406 deletions
2312
corpus/corpus_filtered.jsonl
Normal file
2312
corpus/corpus_filtered.jsonl
Normal file
File diff suppressed because it is too large
Load diff
9835
corpus/corpus_polished.jsonl
Normal file
9835
corpus/corpus_polished.jsonl
Normal file
File diff suppressed because it is too large
Load diff
9835
corpus/corpus_raw.jsonl
Normal file
9835
corpus/corpus_raw.jsonl
Normal file
File diff suppressed because it is too large
Load diff
91
corpus/corpus_stats.json
Normal file
91
corpus/corpus_stats.json
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
{
|
||||
"raw_count": 9835,
|
||||
"raw_by_template": {
|
||||
"deconstruction": 1500,
|
||||
"denial_of_consequences": 1500,
|
||||
"false_equivalence": 1500,
|
||||
"futile_preparation": 1500,
|
||||
"hypocritical_complaint": 1500,
|
||||
"ironic_deficiency": 1500,
|
||||
"tautological_wisdom": 835
|
||||
},
|
||||
"polished_count": 5499,
|
||||
"discarded_during_polish": 4336,
|
||||
"errors_during_polish": 0,
|
||||
"polish_discard_rate": "44.1%",
|
||||
"polished_by_template": {
|
||||
"deconstruction": 1105,
|
||||
"denial_of_consequences": 733,
|
||||
"false_equivalence": 590,
|
||||
"futile_preparation": 882,
|
||||
"hypocritical_complaint": 573,
|
||||
"ironic_deficiency": 831,
|
||||
"tautological_wisdom": 785
|
||||
},
|
||||
"discarded_by_template": {
|
||||
"deconstruction": 395,
|
||||
"denial_of_consequences": 767,
|
||||
"false_equivalence": 910,
|
||||
"futile_preparation": 618,
|
||||
"hypocritical_complaint": 927,
|
||||
"ironic_deficiency": 669,
|
||||
"tautological_wisdom": 50
|
||||
},
|
||||
"filtered_count": 2312,
|
||||
"filtered_by_template": {
|
||||
"deconstruction": 619,
|
||||
"denial_of_consequences": 159,
|
||||
"false_equivalence": 517,
|
||||
"futile_preparation": 284,
|
||||
"hypocritical_complaint": 168,
|
||||
"ironic_deficiency": 358,
|
||||
"tautological_wisdom": 207
|
||||
},
|
||||
"discarded_during_filter": 3187,
|
||||
"training_pair_count": 9257,
|
||||
"training_by_template": {
|
||||
"deconstruction": 2488,
|
||||
"denial_of_consequences": 630,
|
||||
"false_equivalence": 2059,
|
||||
"futile_preparation": 1146,
|
||||
"hypocritical_complaint": 681,
|
||||
"ironic_deficiency": 1429,
|
||||
"tautological_wisdom": 824
|
||||
},
|
||||
"training_by_input_type": {
|
||||
"category_seeded": 2312,
|
||||
"open_ended": 562,
|
||||
"persona_seeded": 2312,
|
||||
"template_seeded": 1759,
|
||||
"word_seeded": 2312
|
||||
},
|
||||
"unique_slot_words_used": 609,
|
||||
"total_vocab_words": 624,
|
||||
"vocab_coverage": "97.6%",
|
||||
"words_never_used": [
|
||||
"agate",
|
||||
"alabaster",
|
||||
"anise",
|
||||
"azalea",
|
||||
"bee",
|
||||
"blowfish",
|
||||
"cattail",
|
||||
"cypress",
|
||||
"emerald",
|
||||
"gem",
|
||||
"grebe",
|
||||
"juniper",
|
||||
"lyre",
|
||||
"spear",
|
||||
"theater"
|
||||
],
|
||||
"words_never_used_count": 15,
|
||||
"avg_saying_length_words": 13.1,
|
||||
"min_saying_length_words": 6,
|
||||
"max_saying_length_words": 23,
|
||||
"balance_warnings": [
|
||||
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
|
||||
]
|
||||
}
|
||||
3188
corpus/discard_analysis.csv
Normal file
3188
corpus/discard_analysis.csv
Normal file
File diff suppressed because it is too large
Load diff
9257
corpus/training_pairs.jsonl
Normal file
9257
corpus/training_pairs.jsonl
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue