Add naturalization pass — 9,025 sayings, 36K training pairs
New pipeline step: naturalize_corpus.py runs Prompt A ("dialect coach")
over both polished and previously-discarded sayings, recovering material
the first polish pass was too aggressive with.
Results:
- 9,468 usable from naturalization (vs 5,499 from initial polish)
- After dedup: 9,025 unique sayings (was 2,312)
- 36,079 training pairs (was 9,257)
- 100% vocab coverage, avg 10.1 words (punchier than 13.1)
- Relaxed quality filter: drops artifacts/nonsense, not noun presence
New scripts:
- naturalize_corpus.py: gentle LLM naturalization pass, resume-safe
- rebuild_training_pairs.py: combined filter + dedup + training pair
generation from naturalized corpus, replaces separate steps
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
651ec3ffc6
commit
9298c425bc
6 changed files with 65131 additions and 11532 deletions
File diff suppressed because it is too large
Load diff
19540
corpus/corpus_naturalized.jsonl
Normal file
19540
corpus/corpus_naturalized.jsonl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,91 +1,31 @@
|
|||
{
|
||||
"raw_count": 9835,
|
||||
"raw_by_template": {
|
||||
"deconstruction": 1500,
|
||||
"denial_of_consequences": 1500,
|
||||
"false_equivalence": 1500,
|
||||
"futile_preparation": 1500,
|
||||
"hypocritical_complaint": 1500,
|
||||
"ironic_deficiency": 1500,
|
||||
"tautological_wisdom": 835
|
||||
"naturalization_input": 19540,
|
||||
"naturalization_status": {
|
||||
"skipped": 436,
|
||||
"naturalized": 18578,
|
||||
"unchanged": 453,
|
||||
"filtered": 73
|
||||
},
|
||||
"polished_count": 5499,
|
||||
"discarded_during_polish": 4336,
|
||||
"errors_during_polish": 0,
|
||||
"polish_discard_rate": "44.1%",
|
||||
"polished_by_template": {
|
||||
"deconstruction": 1105,
|
||||
"denial_of_consequences": 733,
|
||||
"false_equivalence": 590,
|
||||
"futile_preparation": 882,
|
||||
"hypocritical_complaint": 573,
|
||||
"ironic_deficiency": 831,
|
||||
"tautological_wisdom": 785
|
||||
"usable_before_dedup": 19031,
|
||||
"duplicates_removed": 10006,
|
||||
"final_filtered": 9025,
|
||||
"training_pairs": 36079,
|
||||
"by_template": {
|
||||
"deconstruction": 1544,
|
||||
"denial_of_consequences": 750,
|
||||
"false_equivalence": 1897,
|
||||
"futile_preparation": 1735,
|
||||
"hypocritical_complaint": 811,
|
||||
"ironic_deficiency": 1563,
|
||||
"tautological_wisdom": 725
|
||||
},
|
||||
"discarded_by_template": {
|
||||
"deconstruction": 395,
|
||||
"denial_of_consequences": 767,
|
||||
"false_equivalence": 910,
|
||||
"futile_preparation": 618,
|
||||
"hypocritical_complaint": 927,
|
||||
"ironic_deficiency": 669,
|
||||
"tautological_wisdom": 50
|
||||
"by_input_type": {
|
||||
"category_seeded": 9025,
|
||||
"open_ended": 2146,
|
||||
"persona_seeded": 9025,
|
||||
"template_seeded": 6858,
|
||||
"word_seeded": 9025
|
||||
},
|
||||
"filtered_count": 2312,
|
||||
"filtered_by_template": {
|
||||
"deconstruction": 619,
|
||||
"denial_of_consequences": 159,
|
||||
"false_equivalence": 517,
|
||||
"futile_preparation": 284,
|
||||
"hypocritical_complaint": 168,
|
||||
"ironic_deficiency": 358,
|
||||
"tautological_wisdom": 207
|
||||
},
|
||||
"discarded_during_filter": 3187,
|
||||
"training_pair_count": 9257,
|
||||
"training_by_template": {
|
||||
"deconstruction": 2488,
|
||||
"denial_of_consequences": 630,
|
||||
"false_equivalence": 2059,
|
||||
"futile_preparation": 1146,
|
||||
"hypocritical_complaint": 681,
|
||||
"ironic_deficiency": 1429,
|
||||
"tautological_wisdom": 824
|
||||
},
|
||||
"training_by_input_type": {
|
||||
"category_seeded": 2312,
|
||||
"open_ended": 562,
|
||||
"persona_seeded": 2312,
|
||||
"template_seeded": 1759,
|
||||
"word_seeded": 2312
|
||||
},
|
||||
"unique_slot_words_used": 609,
|
||||
"total_vocab_words": 624,
|
||||
"vocab_coverage": "97.6%",
|
||||
"words_never_used": [
|
||||
"agate",
|
||||
"alabaster",
|
||||
"anise",
|
||||
"azalea",
|
||||
"bee",
|
||||
"blowfish",
|
||||
"cattail",
|
||||
"cypress",
|
||||
"emerald",
|
||||
"gem",
|
||||
"grebe",
|
||||
"juniper",
|
||||
"lyre",
|
||||
"spear",
|
||||
"theater"
|
||||
],
|
||||
"words_never_used_count": 15,
|
||||
"avg_saying_length_words": 13.1,
|
||||
"min_saying_length_words": 6,
|
||||
"max_saying_length_words": 23,
|
||||
"balance_warnings": [
|
||||
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
|
||||
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
|
||||
]
|
||||
"vocab_coverage": "624/624 (100.0%)",
|
||||
"avg_length_words": 10.1
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
306
scripts/naturalize_corpus.py
Normal file
306
scripts/naturalize_corpus.py
Normal file
|
|
@ -0,0 +1,306 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Naturalization pass for polished folksy sayings.
|
||||
|
||||
Takes the polished corpus (both filtered and fixable discards) and runs a
|
||||
second LLM pass focused on making them sound like real folk sayings rather
|
||||
than template output. Uses Prompt A (gentle naturalization).
|
||||
|
||||
Resume-safe: tracks already-processed entries by raw_text.
|
||||
|
||||
Usage:
|
||||
python3 scripts/naturalize_corpus.py
|
||||
python3 scripts/naturalize_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_naturalized.jsonl
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Resolve project-relative paths from this script's location so the tool
# works regardless of the current working directory.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

# Local OpenAI-compatible chat-completions server.
# NOTE(review): the "v1d" path segment looks like a typo for the standard
# "/v1/chat/completions" route — confirm against the serving config.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

# Prompt A ("dialect coach"): gentle naturalization. The model is instructed
# to return either the reworked saying alone, or the literal sentinel "SKIP",
# which main() checks for verbatim (case-insensitively).
SYSTEM_PROMPT = """You are a dialect coach for folk sayings. You'll receive a fake proverb that sounds slightly mechanical or template-generated. Your job is to make it sound like something a real person would actually say — natural rhythm, casual grammar, the kind of thing you'd overhear at a general store.

Rules:
- Keep the same meaning and core nouns
- Fix awkward phrasing, robotic word order, or template artifacts
- Make it conversational — contractions, folksy grammar, natural cadence
- Keep it SHORT (under 20 words preferred)
- If it already sounds natural, return it unchanged
- If it's unsalvageable nonsense, respond with: SKIP

Output ONLY the naturalized saying. No quotes, no explanation."""
|
||||
|
||||
|
||||
def llm_chat_completion(text, max_retries=3):
    """Send *text* to the LLM endpoint for naturalization.

    Returns (result, error_type): exactly one of the two is None.
    On success, result is the stripped assistant content. On failure,
    error_type is one of: "context_too_long", "http_400",
    "server_overload", "null_content", "json_error", "unexpected",
    "exhausted_retries".

    Retry policy: exponential backoff (2**attempt seconds) on generic
    HTTP 400, parse errors and unexpected exceptions; a longer backoff
    (2**(attempt+1)) on 429/503; a flat 1s retry on null content.
    Context-length 400s are not retried — they cannot succeed.
    """
    # Imported lazily so the module can be imported without requests installed.
    import requests

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
                "temperature": 0.7,
            }, timeout=120)

            if resp.status_code == 400:
                # Heuristic: a 400 mentioning context/token/length means the
                # input is too long; retrying would fail identically.
                body = resp.text.lower()
                if any(kw in body for kw in ["context", "token", "length"]):
                    return None, "context_too_long"
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return None, "http_400"

            if resp.status_code in (429, 503):
                # Server busy/overloaded — back off more aggressively.
                if attempt < max_retries - 1:
                    time.sleep(2 ** (attempt + 1))
                    continue
                return None, "server_overload"

            resp.raise_for_status()

            try:
                data = resp.json()
                content = data["choices"][0]["message"]["content"]
                if content is None:
                    # Some servers return a null message content on hiccups;
                    # a short pause and retry usually recovers.
                    if attempt < max_retries - 1:
                        time.sleep(1)
                        continue
                    return None, "null_content"
                return content.strip(), None
            except (json.JSONDecodeError, KeyError, IndexError) as e:
                # Malformed response body or unexpected payload shape.
                print(f" Parse error (attempt {attempt+1}): {e}", file=sys.stderr)
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return None, "json_error"

        except Exception as e:
            # Network errors, timeouts, raise_for_status HTTPError, etc.
            print(f" Error (attempt {attempt+1}): {type(e).__name__}: {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            return None, "unexpected"

    # Defensive: each branch above either returns or continues, so this is
    # only reachable if the loop body is exhausted without returning.
    return None, "exhausted_retries"
|
||||
|
||||
|
||||
def relaxed_quality_filter(text):
    """Lenient quality gate for naturalized sayings.

    Catches only hard failures — blank input, degenerate length, leftover
    template artifacts, and leaked LLM meta-commentary. Deliberately does
    NOT require noun presence (that was the stricter first-pass filter).

    Returns a (passed, reason) tuple; reason is "pass" on success.
    """
    if not text:
        return False, "empty"

    word_count = len(text.split())
    if word_count > 25:
        return False, "too_long"
    if word_count < 4:
        return False, "too_short"

    # Generation artifacts: ConceptNet tokens keep their underscores, and
    # unfilled template slots keep their braces.
    if "_" in text:
        return False, "conceptnet_artifact"
    if "{" in text or "}" in text:
        return False, "unfilled_slot"

    # Phrases indicating the LLM answered *about* the saying rather than
    # returning the saying itself.
    meta_markers = ("here's", "here is", "this saying", "i've", "i have",
                    "note:", "explanation:", "bridge word")
    lowered = text.lower()
    for marker in meta_markers:
        if marker in lowered:
            return False, "meta_commentary"

    return True, "pass"
|
||||
|
||||
|
||||
def load_already_processed(output_path):
    """Scan an existing output file so an interrupted run can resume.

    Returns (processed, counts): the set of raw_text values already written,
    and a tally of prior naturalize_status outcomes. Blank and malformed
    JSONL lines are silently skipped; unknown statuses are not counted.
    """
    status_tally = {"naturalized": 0, "skipped": 0, "unchanged": 0, "error": 0, "filtered": 0}
    seen_raw = set()

    if not output_path.exists():
        return seen_raw, status_tally

    with open(output_path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
                seen_raw.add(record.get("raw_text", ""))
                outcome = record.get("naturalize_status", "")
                if outcome in status_tally:
                    status_tally[outcome] += 1
            except json.JSONDecodeError:
                continue

    return seen_raw, status_tally
|
||||
|
||||
|
||||
def main():
    """Run the naturalization pass over the polished corpus.

    Reads the polished JSONL, selects candidates (polished entries plus
    salvageable discards), skips anything already present in the output
    file (resume), and appends one JSON line per processed entry with a
    naturalize_status of: naturalized / unchanged / skipped / filtered /
    error.
    """
    parser = argparse.ArgumentParser(description="Naturalization pass for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Input polished JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"),
                        help="Output naturalized JSONL file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load all polished entries (both status=polished and status=discarded with raw_text)
    candidates = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # Malformed lines are dropped silently.
                continue

            status = entry.get("status", "")
            if status == "polished":
                # Already polished — naturalize the polished text
                candidates.append(entry)
            elif status == "discarded":
                # Was discarded by first polish — try naturalizing the raw text,
                # but only if it pre-passes the cheap length/artifact checks.
                raw = entry.get("raw_text", "")
                if raw and 4 <= len(raw.split()) <= 25 and "_" not in raw:
                    entry["_from_discard"] = True
                    candidates.append(entry)

    print(f"Loaded {len(candidates)} candidates "
          f"({sum(1 for c in candidates if not c.get('_from_discard'))} polished, "
          f"{sum(1 for c in candidates if c.get('_from_discard'))} from discards)")

    # Resume check: anything whose raw_text already appears in the output is
    # skipped. NOTE(review): entries previously written with status "error"
    # are also in this set, so a resumed run will NOT retry them — confirm
    # this is intended.
    already_processed, prev_counts = load_already_processed(output_path)
    remaining = [e for e in candidates if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)} "
          f"(naturalized={prev_counts['naturalized']}, "
          f"unchanged={prev_counts['unchanged']}, "
          f"skipped={prev_counts['skipped']}, "
          f"filtered={prev_counts['filtered']}, "
          f"errors={prev_counts['error']})")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    # Per-session tallies (resume counts above are not folded in).
    naturalized = 0
    unchanged = 0
    skipped = 0
    filtered_out = 0
    errors = 0
    consecutive_errors = 0
    start_time = time.time()

    try:
        # Append mode so a resumed run extends the existing output.
        with open(output_path, "a", encoding="utf-8") as out:
            for i, entry in enumerate(remaining):
                # Determine what text to send: discards get their raw text,
                # polished entries get the polished text (raw as fallback).
                if entry.get("_from_discard"):
                    input_text = entry.get("raw_text", "")
                else:
                    input_text = entry.get("polished_text", entry.get("raw_text", ""))

                response, error_type = llm_chat_completion(input_text)

                if response is None:
                    entry["naturalize_status"] = "error"
                    entry["naturalize_error"] = error_type
                    errors += 1
                    consecutive_errors += 1

                    # 20 failures in a row almost certainly means the server
                    # is down — record the last entry and bail out.
                    if consecutive_errors >= 20:
                        print(f"\nFATAL: {consecutive_errors} consecutive errors. Stopping.",
                              file=sys.stderr)
                        out.write(json.dumps(entry, ensure_ascii=False) + "\n")
                        out.flush()
                        sys.exit(1)

                elif response.strip().upper() == "SKIP":
                    # Model judged the saying unsalvageable.
                    entry["naturalize_status"] = "skipped"
                    skipped += 1
                    consecutive_errors = 0

                else:
                    cleaned = response.strip()
                    # Strip quotes if wrapped
                    if cleaned.startswith('"') and cleaned.endswith('"'):
                        cleaned = cleaned[1:-1]

                    # Apply relaxed quality filter
                    passed, reason = relaxed_quality_filter(cleaned)
                    if not passed:
                        entry["naturalize_status"] = "filtered"
                        entry["naturalize_filter_reason"] = reason
                        filtered_out += 1
                    elif cleaned == input_text:
                        # Model returned the input verbatim — already natural.
                        entry["naturalized_text"] = cleaned
                        entry["naturalize_status"] = "unchanged"
                        unchanged += 1
                    else:
                        entry["naturalized_text"] = cleaned
                        entry["naturalize_status"] = "naturalized"
                        naturalized += 1

                    consecutive_errors = 0

                # Clean up internal field
                entry.pop("_from_discard", None)

                out.write(json.dumps(entry, ensure_ascii=False) + "\n")

                # Flush every 10 entries so at most ~10 results are lost on crash.
                if (i + 1) % 10 == 0:
                    out.flush()

                if (i + 1) % 100 == 0:
                    total_done = len(already_processed) + i + 1
                    elapsed = time.time() - start_time
                    rate = (i + 1) / elapsed
                    eta_min = (len(remaining) - (i + 1)) / rate / 60 if rate > 0 else 0
                    print(f" [{total_done}/{len(candidates)}] "
                          f"naturalized={naturalized}, unchanged={unchanged}, "
                          f"skipped={skipped}, filtered={filtered_out}, errors={errors} "
                          f"({rate:.1f}/s, ETA {eta_min:.0f}m)")

                # Gentle pacing between requests.
                time.sleep(0.1)

    except KeyboardInterrupt:
        # `i` is bound because `remaining` is non-empty here (checked above).
        print(f"\nInterrupted at {i+1}/{len(remaining)}. Re-run to resume.", file=sys.stderr)

    elapsed = time.time() - start_time
    total = naturalized + unchanged + skipped + filtered_out + errors
    print(f"\nSession complete: {total} entries in {elapsed/60:.1f} minutes.")
    print(f" Naturalized: {naturalized}")
    print(f" Unchanged: {unchanged}")
    print(f" Skipped: {skipped}")
    print(f" Filtered: {filtered_out}")
    print(f" Errors: {errors}")
    usable = naturalized + unchanged
    print(f" Usable: {usable}")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()
|
||||
278
scripts/rebuild_training_pairs.py
Normal file
278
scripts/rebuild_training_pairs.py
Normal file
|
|
@ -0,0 +1,278 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Rebuild training pairs from naturalized corpus.
|
||||
|
||||
Reads corpus_naturalized.jsonl, applies relaxed quality filter,
|
||||
deduplicates, and formats training pairs. Replaces the separate
|
||||
filter_corpus.py + format_training_pairs.py steps.
|
||||
|
||||
Usage:
|
||||
python3 scripts/rebuild_training_pairs.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
from collections import Counter
|
||||
from difflib import SequenceMatcher
|
||||
from pathlib import Path
|
||||
|
||||
# Resolve project-relative paths from this script's location so the tool
# works regardless of the current working directory.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
DATA_DIR = PROJECT_DIR / "data"
# NOTE(review): EXAMPLES_DIR is not referenced anywhere in this file's
# visible code — confirm it is still needed.
EXAMPLES_DIR = PROJECT_DIR / "examples"

# Speakers used for the persona-seeded training prompt variant.
PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]

# Generic prompts for the open-ended training pair variant (30% chance per entry).
OPEN_ENDED_PROMPTS = [
    "Tell me some folk wisdom.",
    "What do they say?",
    "Give me a proverb.",
    "Share some old-time wisdom.",
    "What's a good saying?",
]

# Maps internal meta_template identifiers to human-readable names used in
# the template-seeded prompt text.
TEMPLATE_NAMES = {
    "deconstruction": "deconstruction",
    "denial_of_consequences": "denial of consequences",
    "ironic_deficiency": "ironic deficiency",
    "futile_preparation": "futile preparation",
    "hypocritical_complaint": "hypocritical complaint",
    "tautological_wisdom": "tautological wisdom",
    "false_equivalence": "false equivalence",
}
|
||||
|
||||
|
||||
def is_near_duplicate(text_a, text_b, threshold=0.75):
    """True when the two sayings are near-identical (case-insensitive).

    Similarity is difflib's SequenceMatcher ratio; strictly greater than
    *threshold* counts as a duplicate.
    """
    similarity = SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio()
    return similarity > threshold
|
||||
|
||||
|
||||
def deduplicate_within_family(entries):
    """Drop near-duplicate sayings, comparing only within each template family.

    Entries are grouped by meta_template; within a group, an entry whose
    final_text is more than 75% similar (SequenceMatcher, case-insensitive)
    to an already-kept entry is discarded. First occurrence wins; group and
    within-group order are preserved in the result.

    Returns (kept_entries, removed_count).
    """
    grouped = {}
    for item in entries:
        grouped.setdefault(item.get("meta_template", "unknown"), []).append(item)

    survivors = []
    dropped = 0

    for group in grouped.values():
        unique_in_group = []
        for candidate in group:
            candidate_text = candidate.get("final_text", "").lower()
            duplicate = any(
                SequenceMatcher(None, candidate_text,
                                prior.get("final_text", "").lower()).ratio() > 0.75
                for prior in unique_in_group
            )
            if duplicate:
                dropped += 1
            else:
                unique_in_group.append(candidate)
        survivors.extend(unique_in_group)

    return survivors, dropped
|
||||
|
||||
|
||||
def load_vocab_categories(vocab_path=None):
    """Load the word -> categories mapping from the folksy vocab CSV.

    Args:
        vocab_path: Optional path (str or Path) to the vocab CSV. Defaults
            to data/folksy_vocab.csv under the project directory. Added as
            a backward-compatible parameter so the hard-coded path can be
            overridden by callers and tests.

    Returns:
        Dict mapping each vocab word to its list of category strings, or
        an empty dict if the file does not exist.

    The CSV must have "word" and "categories" columns; "categories" holds
    a comma-separated list within one field.
    """
    if vocab_path is None:
        vocab_path = DATA_DIR / "folksy_vocab.csv"
    else:
        vocab_path = Path(vocab_path)

    word_cats = {}
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                # Split the categories field and drop empty fragments left
                # by stray commas/whitespace.
                cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
                word_cats[row["word"]] = cats
    return word_cats
|
||||
|
||||
|
||||
def generate_training_pairs(entry, word_cats):
    """Build instruction-tuning pairs for one saying.

    Emits up to five pairs per entry: word-seeded, category-seeded,
    persona-seeded, template-seeded (70% chance) and open-ended (30%
    chance). Each pair carries the saying as "output" plus provenance
    metadata (meta_template, source_words).
    """
    saying = entry.get("final_text", "")
    slot_values = entry.get("slots", {})
    family = entry.get("meta_template", "")

    # Slot words usable as prompt seeds; skip empty values, article-prefixed
    # phrases ("a ...", "an ...") and single characters.
    seed_words = []
    for value in slot_values.values():
        if value and len(value) > 1 and not value.startswith(("a ", "an ")):
            seed_words.append(value)

    # Vocab categories attached to any seed word (vocab keys are snake_case).
    categories = set()
    for seed in seed_words:
        key = seed.lower().replace(" ", "_")
        if key in word_cats:
            categories.update(word_cats[key])

    common = {
        "output": saying,
        "meta_template": family,
        "source_words": seed_words,
    }
    pairs = []

    if seed_words:
        seed = random.choice(seed_words)
        pairs.append({**common, "input": f"Tell me something about {seed}."})

    if categories:
        category = random.choice(list(categories))
        pairs.append({**common, "input": f"Tell me a saying about {category}."})

    speaker = random.choice(PERSONAS)
    if seed_words:
        seed = random.choice(seed_words)
        pairs.append({**common, "input": f"What would a {speaker} say about {seed}?"})

    if random.random() < 0.7:
        label = TEMPLATE_NAMES.get(family, family)
        pairs.append({**common, "input": f"Give me a {label} proverb."})

    if random.random() < 0.3:
        pairs.append({**common, "input": random.choice(OPEN_ENDED_PROMPTS)})

    return pairs
|
||||
|
||||
|
||||
def main():
    """Rebuild the filtered corpus and training pairs from the naturalized JSONL.

    Pipeline: load entries with naturalize_status naturalized/unchanged,
    near-dedup within template families, write the filtered corpus, generate
    training pairs, then write pairs + a stats JSON and print a summary.
    """
    parser = argparse.ArgumentParser(description="Rebuild training pairs from naturalized corpus.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"))
    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"))
    parser.add_argument("--filtered-output", default=str(CORPUS_DIR / "corpus_filtered.jsonl"))
    parser.add_argument("--stats-output", default=str(CORPUS_DIR / "corpus_stats.json"))
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    filtered_path = Path(args.filtered_output)
    stats_path = Path(args.stats_output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load naturalized entries — use naturalized_text if available, else polished_text
    usable = []
    total_loaded = 0
    status_counts = Counter()

    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # Malformed lines are skipped and not counted in total_loaded.
                continue
            total_loaded += 1
            nat_status = entry.get("naturalize_status", "")
            status_counts[nat_status] += 1

            if nat_status in ("naturalized", "unchanged"):
                final = entry.get("naturalized_text", entry.get("polished_text", ""))
                if final:
                    entry["final_text"] = final
                    usable.append(entry)

    print(f"Loaded {total_loaded} entries from {input_path}")
    print(f"Status breakdown: {dict(status_counts)}")
    print(f"Usable (naturalized + unchanged): {len(usable)}")

    # Deduplicate
    kept, dup_count = deduplicate_within_family(usable)
    print(f"Near-duplicate removal: {dup_count} removed, {len(kept)} remaining")

    # Write filtered corpus
    filtered_path.parent.mkdir(parents=True, exist_ok=True)
    with open(filtered_path, "w", encoding="utf-8") as f:
        for entry in kept:
            # Write with final_text as polished_text for compatibility
            out_entry = {k: v for k, v in entry.items() if k != "final_text"}
            out_entry["polished_text"] = entry["final_text"]
            f.write(json.dumps(out_entry, ensure_ascii=False) + "\n")

    print(f"Filtered corpus: {len(kept)} entries -> {filtered_path}")

    # Generate training pairs.
    # NOTE(review): random is never seeded, so pair selection differs run to
    # run — seed here if reproducible rebuilds matter.
    word_cats = load_vocab_categories()
    all_pairs = []
    for entry in kept:
        pairs = generate_training_pairs(entry, word_cats)
        all_pairs.extend(pairs)

    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    # Stats: classify each pair back into its input type by prompt prefix.
    # These prefixes must stay in sync with generate_training_pairs.
    template_counts = Counter(e.get("meta_template", "unknown") for e in kept)
    input_type_counts = Counter()
    for pair in all_pairs:
        inp = pair["input"]
        if inp.startswith("Tell me something about"):
            input_type_counts["word_seeded"] += 1
        elif inp.startswith("Tell me a saying about"):
            input_type_counts["category_seeded"] += 1
        elif inp.startswith("What would a"):
            input_type_counts["persona_seeded"] += 1
        elif inp.startswith("Give me a") and "proverb" in inp:
            input_type_counts["template_seeded"] += 1
        else:
            input_type_counts["open_ended"] += 1

    # Vocab coverage
    vocab_words = set()
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                vocab_words.add(row["word"])

    # NOTE(review): assumes every slots value is a string — verify upstream.
    used_words = set()
    for entry in kept:
        for v in entry.get("slots", {}).values():
            word = v.lower().replace(" ", "_")
            if word in vocab_words:
                used_words.add(word)

    lengths = [len(e["final_text"].split()) for e in kept if e.get("final_text")]

    stats = {
        "naturalization_input": total_loaded,
        "naturalization_status": dict(status_counts),
        "usable_before_dedup": len(usable),
        "duplicates_removed": dup_count,
        "final_filtered": len(kept),
        "training_pairs": len(all_pairs),
        "by_template": dict(sorted(template_counts.items())),
        "by_input_type": dict(sorted(input_type_counts.items())),
        "vocab_coverage": f"{len(used_words)}/{len(vocab_words)} ({len(used_words)/len(vocab_words)*100:.1f}%)" if vocab_words else "N/A",
        "avg_length_words": round(sum(lengths) / len(lengths), 1) if lengths else 0,
    }

    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print(f"FINAL CORPUS STATS")
    print(f"{'='*50}")
    print(f"Unique sayings: {len(kept)}")
    print(f"Training pairs: {len(all_pairs)}")
    print(f"Avg length: {stats['avg_length_words']} words")
    print(f"Vocab coverage: {stats['vocab_coverage']}")
    print(f"\nBy template:")
    # NOTE(review): divides by len(kept) — raises ZeroDivisionError if the
    # corpus ends up empty; guard upstream if that can happen.
    for t, c in sorted(template_counts.items()):
        pct = c / len(kept) * 100
        flag = " <-- below 10%" if pct < 10 else ""
        print(f" {t:30s} {c:5d} ({pct:5.1f}%){flag}")
    print(f"\nBy input type:")
    for t, c in sorted(input_type_counts.items()):
        print(f" {t:20s} {c:5d}")
    print(f"\nOutputs:")
    print(f" {filtered_path}")
    print(f" {output_path}")
    print(f" {stats_path}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue