Add naturalization pass — 9,025 sayings, 36K training pairs

New pipeline step: naturalize_corpus.py runs Prompt A ("dialect coach")
over both polished and previously discarded sayings, recovering material
that the first polish pass discarded too aggressively.

Results:
- 9,468 usable from naturalization (vs 5,499 from initial polish)
- After dedup: 9,025 unique sayings (was 2,312)
- 36,079 training pairs (was 9,257)
- 100% vocab coverage (was 97.6%), avg saying length 10.1 words (punchier than 13.1)
- Relaxed quality filter: drops only artifacts/nonsense; no longer filters on noun presence

New scripts:
- naturalize_corpus.py: gentle LLM naturalization pass, resume-safe
- rebuild_training_pairs.py: combines filtering, dedup, and training-pair
  generation from the naturalized corpus, replacing the separate steps

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
John McCardle 2026-03-10 07:24:37 -04:00
commit 9298c425bc
6 changed files with 65131 additions and 11532 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,91 +1,31 @@
{
"raw_count": 9835,
"raw_by_template": {
"deconstruction": 1500,
"denial_of_consequences": 1500,
"false_equivalence": 1500,
"futile_preparation": 1500,
"hypocritical_complaint": 1500,
"ironic_deficiency": 1500,
"tautological_wisdom": 835
"naturalization_input": 19540,
"naturalization_status": {
"skipped": 436,
"naturalized": 18578,
"unchanged": 453,
"filtered": 73
},
"polished_count": 5499,
"discarded_during_polish": 4336,
"errors_during_polish": 0,
"polish_discard_rate": "44.1%",
"polished_by_template": {
"deconstruction": 1105,
"denial_of_consequences": 733,
"false_equivalence": 590,
"futile_preparation": 882,
"hypocritical_complaint": 573,
"ironic_deficiency": 831,
"tautological_wisdom": 785
"usable_before_dedup": 19031,
"duplicates_removed": 10006,
"final_filtered": 9025,
"training_pairs": 36079,
"by_template": {
"deconstruction": 1544,
"denial_of_consequences": 750,
"false_equivalence": 1897,
"futile_preparation": 1735,
"hypocritical_complaint": 811,
"ironic_deficiency": 1563,
"tautological_wisdom": 725
},
"discarded_by_template": {
"deconstruction": 395,
"denial_of_consequences": 767,
"false_equivalence": 910,
"futile_preparation": 618,
"hypocritical_complaint": 927,
"ironic_deficiency": 669,
"tautological_wisdom": 50
"by_input_type": {
"category_seeded": 9025,
"open_ended": 2146,
"persona_seeded": 9025,
"template_seeded": 6858,
"word_seeded": 9025
},
"filtered_count": 2312,
"filtered_by_template": {
"deconstruction": 619,
"denial_of_consequences": 159,
"false_equivalence": 517,
"futile_preparation": 284,
"hypocritical_complaint": 168,
"ironic_deficiency": 358,
"tautological_wisdom": 207
},
"discarded_during_filter": 3187,
"training_pair_count": 9257,
"training_by_template": {
"deconstruction": 2488,
"denial_of_consequences": 630,
"false_equivalence": 2059,
"futile_preparation": 1146,
"hypocritical_complaint": 681,
"ironic_deficiency": 1429,
"tautological_wisdom": 824
},
"training_by_input_type": {
"category_seeded": 2312,
"open_ended": 562,
"persona_seeded": 2312,
"template_seeded": 1759,
"word_seeded": 2312
},
"unique_slot_words_used": 609,
"total_vocab_words": 624,
"vocab_coverage": "97.6%",
"words_never_used": [
"agate",
"alabaster",
"anise",
"azalea",
"bee",
"blowfish",
"cattail",
"cypress",
"emerald",
"gem",
"grebe",
"juniper",
"lyre",
"spear",
"theater"
],
"words_never_used_count": 15,
"avg_saying_length_words": 13.1,
"min_saying_length_words": 6,
"max_saying_length_words": 23,
"balance_warnings": [
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
]
"vocab_coverage": "624/624 (100.0%)",
"avg_length_words": 10.1
}

File diff suppressed because it is too large Load diff