Add naturalization pass — 9,025 sayings, 36K training pairs
New pipeline step: naturalize_corpus.py runs Prompt A ("dialect coach")
over both polished and previously discarded sayings, recovering material
that the first polish pass handled too aggressively.
Results:
- 9,468 usable from naturalization (vs 5,499 from initial polish)
- After dedup: 9,025 unique sayings (was 2,312)
- 36,079 training pairs (was 9,257)
- 100% vocab coverage (was 97.6%), avg saying length 10.1 words (punchier; was 13.1)
- Relaxed quality filter: rejects sayings for artifacts/nonsense only, no longer for noun presence
New scripts:
- naturalize_corpus.py: gentle LLM naturalization pass, resume-safe
- rebuild_training_pairs.py: combined filter + dedup + training-pair
  generation from the naturalized corpus; replaces the separate steps
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
651ec3ffc6
commit
9298c425bc
6 changed files with 65131 additions and 11532 deletions
File diff suppressed because it is too large
Load diff
19540
corpus/corpus_naturalized.jsonl
Normal file
19540
corpus/corpus_naturalized.jsonl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,91 +1,31 @@
|
|||
{
|
||||
"raw_count": 9835,
|
||||
"raw_by_template": {
|
||||
"deconstruction": 1500,
|
||||
"denial_of_consequences": 1500,
|
||||
"false_equivalence": 1500,
|
||||
"futile_preparation": 1500,
|
||||
"hypocritical_complaint": 1500,
|
||||
"ironic_deficiency": 1500,
|
||||
"tautological_wisdom": 835
|
||||
"naturalization_input": 19540,
|
||||
"naturalization_status": {
|
||||
"skipped": 436,
|
||||
"naturalized": 18578,
|
||||
"unchanged": 453,
|
||||
"filtered": 73
|
||||
},
|
||||
"polished_count": 5499,
|
||||
"discarded_during_polish": 4336,
|
||||
"errors_during_polish": 0,
|
||||
"polish_discard_rate": "44.1%",
|
||||
"polished_by_template": {
|
||||
"deconstruction": 1105,
|
||||
"denial_of_consequences": 733,
|
||||
"false_equivalence": 590,
|
||||
"futile_preparation": 882,
|
||||
"hypocritical_complaint": 573,
|
||||
"ironic_deficiency": 831,
|
||||
"tautological_wisdom": 785
|
||||
"usable_before_dedup": 19031,
|
||||
"duplicates_removed": 10006,
|
||||
"final_filtered": 9025,
|
||||
"training_pairs": 36079,
|
||||
"by_template": {
|
||||
"deconstruction": 1544,
|
||||
"denial_of_consequences": 750,
|
||||
"false_equivalence": 1897,
|
||||
"futile_preparation": 1735,
|
||||
"hypocritical_complaint": 811,
|
||||
"ironic_deficiency": 1563,
|
||||
"tautological_wisdom": 725
|
||||
},
|
||||
"discarded_by_template": {
|
||||
"deconstruction": 395,
|
||||
"denial_of_consequences": 767,
|
||||
"false_equivalence": 910,
|
||||
"futile_preparation": 618,
|
||||
"hypocritical_complaint": 927,
|
||||
"ironic_deficiency": 669,
|
||||
"tautological_wisdom": 50
|
||||
"by_input_type": {
|
||||
"category_seeded": 9025,
|
||||
"open_ended": 2146,
|
||||
"persona_seeded": 9025,
|
||||
"template_seeded": 6858,
|
||||
"word_seeded": 9025
|
||||
},
|
||||
"filtered_count": 2312,
|
||||
"filtered_by_template": {
|
||||
"deconstruction": 619,
|
||||
"denial_of_consequences": 159,
|
||||
"false_equivalence": 517,
|
||||
"futile_preparation": 284,
|
||||
"hypocritical_complaint": 168,
|
||||
"ironic_deficiency": 358,
|
||||
"tautological_wisdom": 207
|
||||
},
|
||||
"discarded_during_filter": 3187,
|
||||
"training_pair_count": 9257,
|
||||
"training_by_template": {
|
||||
"deconstruction": 2488,
|
||||
"denial_of_consequences": 630,
|
||||
"false_equivalence": 2059,
|
||||
"futile_preparation": 1146,
|
||||
"hypocritical_complaint": 681,
|
||||
"ironic_deficiency": 1429,
|
||||
"tautological_wisdom": 824
|
||||
},
|
||||
"training_by_input_type": {
|
||||
"category_seeded": 2312,
|
||||
"open_ended": 562,
|
||||
"persona_seeded": 2312,
|
||||
"template_seeded": 1759,
|
||||
"word_seeded": 2312
|
||||
},
|
||||
"unique_slot_words_used": 609,
|
||||
"total_vocab_words": 624,
|
||||
"vocab_coverage": "97.6%",
|
||||
"words_never_used": [
|
||||
"agate",
|
||||
"alabaster",
|
||||
"anise",
|
||||
"azalea",
|
||||
"bee",
|
||||
"blowfish",
|
||||
"cattail",
|
||||
"cypress",
|
||||
"emerald",
|
||||
"gem",
|
||||
"grebe",
|
||||
"juniper",
|
||||
"lyre",
|
||||
"spear",
|
||||
"theater"
|
||||
],
|
||||
"words_never_used_count": 15,
|
||||
"avg_saying_length_words": 13.1,
|
||||
"min_saying_length_words": 6,
|
||||
"max_saying_length_words": 23,
|
||||
"balance_warnings": [
|
||||
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
|
||||
]
|
||||
"vocab_coverage": "624/624 (100.0%)",
|
||||
"avg_length_words": 10.1
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue