Add naturalization pass — 9,025 sayings, 36K training pairs
New pipeline step: naturalize_corpus.py runs Prompt A ("dialect coach")
over both polished and previously-discarded sayings, recovering material
the first polish pass was too aggressive with.
Results:
- 9,468 usable from naturalization (vs 5,499 from initial polish)
- After dedup: 9,025 unique sayings (was 2,312)
- 36,079 training pairs (was 9,257)
- 100% vocab coverage, avg 10.1 words (punchier than 13.1)
- Relaxed quality filter: drops artifacts/nonsense, not noun presence
New scripts:
- naturalize_corpus.py: gentle LLM naturalization pass, resume-safe
- rebuild_training_pairs.py: combined filter + dedup + training pair
generation from naturalized corpus, replaces separate steps
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
651ec3ffc6
commit
9298c425bc
6 changed files with 65131 additions and 11532 deletions
File diff suppressed because it is too large
Load diff
19540
corpus/corpus_naturalized.jsonl
Normal file
19540
corpus/corpus_naturalized.jsonl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,91 +1,31 @@
|
||||||
{
|
{
|
||||||
"raw_count": 9835,
|
"naturalization_input": 19540,
|
||||||
"raw_by_template": {
|
"naturalization_status": {
|
||||||
"deconstruction": 1500,
|
"skipped": 436,
|
||||||
"denial_of_consequences": 1500,
|
"naturalized": 18578,
|
||||||
"false_equivalence": 1500,
|
"unchanged": 453,
|
||||||
"futile_preparation": 1500,
|
"filtered": 73
|
||||||
"hypocritical_complaint": 1500,
|
|
||||||
"ironic_deficiency": 1500,
|
|
||||||
"tautological_wisdom": 835
|
|
||||||
},
|
},
|
||||||
"polished_count": 5499,
|
"usable_before_dedup": 19031,
|
||||||
"discarded_during_polish": 4336,
|
"duplicates_removed": 10006,
|
||||||
"errors_during_polish": 0,
|
"final_filtered": 9025,
|
||||||
"polish_discard_rate": "44.1%",
|
"training_pairs": 36079,
|
||||||
"polished_by_template": {
|
"by_template": {
|
||||||
"deconstruction": 1105,
|
"deconstruction": 1544,
|
||||||
"denial_of_consequences": 733,
|
"denial_of_consequences": 750,
|
||||||
"false_equivalence": 590,
|
"false_equivalence": 1897,
|
||||||
"futile_preparation": 882,
|
"futile_preparation": 1735,
|
||||||
"hypocritical_complaint": 573,
|
"hypocritical_complaint": 811,
|
||||||
"ironic_deficiency": 831,
|
"ironic_deficiency": 1563,
|
||||||
"tautological_wisdom": 785
|
"tautological_wisdom": 725
|
||||||
},
|
},
|
||||||
"discarded_by_template": {
|
"by_input_type": {
|
||||||
"deconstruction": 395,
|
"category_seeded": 9025,
|
||||||
"denial_of_consequences": 767,
|
"open_ended": 2146,
|
||||||
"false_equivalence": 910,
|
"persona_seeded": 9025,
|
||||||
"futile_preparation": 618,
|
"template_seeded": 6858,
|
||||||
"hypocritical_complaint": 927,
|
"word_seeded": 9025
|
||||||
"ironic_deficiency": 669,
|
|
||||||
"tautological_wisdom": 50
|
|
||||||
},
|
},
|
||||||
"filtered_count": 2312,
|
"vocab_coverage": "624/624 (100.0%)",
|
||||||
"filtered_by_template": {
|
"avg_length_words": 10.1
|
||||||
"deconstruction": 619,
|
|
||||||
"denial_of_consequences": 159,
|
|
||||||
"false_equivalence": 517,
|
|
||||||
"futile_preparation": 284,
|
|
||||||
"hypocritical_complaint": 168,
|
|
||||||
"ironic_deficiency": 358,
|
|
||||||
"tautological_wisdom": 207
|
|
||||||
},
|
|
||||||
"discarded_during_filter": 3187,
|
|
||||||
"training_pair_count": 9257,
|
|
||||||
"training_by_template": {
|
|
||||||
"deconstruction": 2488,
|
|
||||||
"denial_of_consequences": 630,
|
|
||||||
"false_equivalence": 2059,
|
|
||||||
"futile_preparation": 1146,
|
|
||||||
"hypocritical_complaint": 681,
|
|
||||||
"ironic_deficiency": 1429,
|
|
||||||
"tautological_wisdom": 824
|
|
||||||
},
|
|
||||||
"training_by_input_type": {
|
|
||||||
"category_seeded": 2312,
|
|
||||||
"open_ended": 562,
|
|
||||||
"persona_seeded": 2312,
|
|
||||||
"template_seeded": 1759,
|
|
||||||
"word_seeded": 2312
|
|
||||||
},
|
|
||||||
"unique_slot_words_used": 609,
|
|
||||||
"total_vocab_words": 624,
|
|
||||||
"vocab_coverage": "97.6%",
|
|
||||||
"words_never_used": [
|
|
||||||
"agate",
|
|
||||||
"alabaster",
|
|
||||||
"anise",
|
|
||||||
"azalea",
|
|
||||||
"bee",
|
|
||||||
"blowfish",
|
|
||||||
"cattail",
|
|
||||||
"cypress",
|
|
||||||
"emerald",
|
|
||||||
"gem",
|
|
||||||
"grebe",
|
|
||||||
"juniper",
|
|
||||||
"lyre",
|
|
||||||
"spear",
|
|
||||||
"theater"
|
|
||||||
],
|
|
||||||
"words_never_used_count": 15,
|
|
||||||
"avg_saying_length_words": 13.1,
|
|
||||||
"min_saying_length_words": 6,
|
|
||||||
"max_saying_length_words": 23,
|
|
||||||
"balance_warnings": [
|
|
||||||
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
|
|
||||||
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
|
|
||||||
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
File diff suppressed because it is too large
Load diff
306
scripts/naturalize_corpus.py
Normal file
306
scripts/naturalize_corpus.py
Normal file
|
|
@ -0,0 +1,306 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Naturalization pass for polished folksy sayings.
|
||||||
|
|
||||||
|
Takes the polished corpus (both filtered and fixable discards) and runs a
|
||||||
|
second LLM pass focused on making them sound like real folk sayings rather
|
||||||
|
than template output. Uses Prompt A (gentle naturalization).
|
||||||
|
|
||||||
|
Resume-safe: tracks already-processed entries by raw_text.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/naturalize_corpus.py
|
||||||
|
python3 scripts/naturalize_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_naturalized.jsonl
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Project layout: scripts/ lives beside corpus/.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

# Local OpenAI-compatible chat-completions server.
# NOTE(review): the "v1d" path segment is unusual — the OpenAI-compatible
# convention is "/v1/chat/completions"; confirm the endpoint is intentional.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

# Prompt A ("dialect coach"): gentle naturalization. The model must reply
# with ONLY the rewritten saying, or the sentinel "SKIP" for unsalvageable
# input — both forms are parsed by main().
SYSTEM_PROMPT = """You are a dialect coach for folk sayings. You'll receive a fake proverb that sounds slightly mechanical or template-generated. Your job is to make it sound like something a real person would actually say — natural rhythm, casual grammar, the kind of thing you'd overhear at a general store.

Rules:
- Keep the same meaning and core nouns
- Fix awkward phrasing, robotic word order, or template artifacts
- Make it conversational — contractions, folksy grammar, natural cadence
- Keep it SHORT (under 20 words preferred)
- If it already sounds natural, return it unchanged
- If it's unsalvageable nonsense, respond with: SKIP

Output ONLY the naturalized saying. No quotes, no explanation."""
|
||||||
|
|
||||||
|
|
||||||
|
def llm_chat_completion(text, max_retries=3):
    """Send text for naturalization. Returns (result, error_type).

    On success the second element is None; on failure the first element is
    None and the second is a short tag naming the failure mode
    ("context_too_long", "http_400", "server_overload", "null_content",
    "json_error", "unexpected", "exhausted_retries").
    Transient failures are retried with exponential backoff.
    """
    # Imported lazily so the module can be imported without requests installed.
    import requests

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
                "temperature": 0.7,
            }, timeout=120)

            # A 400 whose body mentions context/token/length means the prompt
            # itself is too big — retrying cannot help, so bail immediately.
            if resp.status_code == 400:
                body = resp.text.lower()
                if any(kw in body for kw in ["context", "token", "length"]):
                    return None, "context_too_long"
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return None, "http_400"

            # Rate limiting / overload: back off one step longer than usual.
            if resp.status_code in (429, 503):
                if attempt < max_retries - 1:
                    time.sleep(2 ** (attempt + 1))
                    continue
                return None, "server_overload"

            resp.raise_for_status()

            try:
                data = resp.json()
                content = data["choices"][0]["message"]["content"]
                # Some backends return a null content field; treat as retryable.
                if content is None:
                    if attempt < max_retries - 1:
                        time.sleep(1)
                        continue
                    return None, "null_content"
                return content.strip(), None
            except (json.JSONDecodeError, KeyError, IndexError) as e:
                # Malformed or unexpectedly-shaped response body.
                print(f" Parse error (attempt {attempt+1}): {e}", file=sys.stderr)
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return None, "json_error"

        except Exception as e:
            # Network errors, timeouts, and raise_for_status() HTTP errors all
            # land here and are retried with exponential backoff.
            print(f" Error (attempt {attempt+1}): {type(e).__name__}: {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            return None, "unexpected"

    return None, "exhausted_retries"
|
||||||
|
|
||||||
|
|
||||||
|
def relaxed_quality_filter(text):
    """Lenient quality gate: reject only artifacts and obvious nonsense.

    Unlike the strict polish-stage filter, this does not require any
    particular noun to be present in the saying.

    Returns:
        (passed, reason) — reason is "pass" on success, otherwise a short
        tag naming the first rule that rejected the text.
    """
    if not text:
        return False, "empty"

    word_count = len(text.split())
    if word_count > 25:
        return False, "too_long"
    if word_count < 4:
        return False, "too_short"

    # Leftover template / ConceptNet machinery.
    if "_" in text:
        return False, "conceptnet_artifact"
    if "{" in text or "}" in text:
        return False, "unfilled_slot"

    # Phrases that indicate the LLM replied *about* the saying rather than
    # *with* the saying.
    meta_markers = ("here's", "here is", "this saying", "i've", "i have",
                    "note:", "explanation:", "bridge word")
    lowered = text.lower()
    for marker in meta_markers:
        if marker in lowered:
            return False, "meta_commentary"

    return True, "pass"
|
||||||
|
|
||||||
|
|
||||||
|
def load_already_processed(output_path):
    """Scan an existing output file so a rerun can resume where it stopped.

    Returns:
        (processed, counts) — the set of raw_text values already written,
        and a tally of entries per naturalize_status.
    """
    seen = set()
    tally = dict.fromkeys(
        ("naturalized", "skipped", "unchanged", "error", "filtered"), 0)

    if not output_path.exists():
        return seen, tally

    with open(output_path, encoding="utf-8") as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError:
                # Tolerate a partially-written trailing line from a crash.
                continue
            seen.add(record.get("raw_text", ""))
            status = record.get("naturalize_status", "")
            if status in tally:
                tally[status] += 1

    return seen, tally
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the naturalization pass over the polished corpus.

    Loads polished entries plus salvageable discards, skips anything already
    present in the output file (resume), and appends one JSON line per
    processed entry with a ``naturalize_status`` field.
    """
    parser = argparse.ArgumentParser(description="Naturalization pass for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Input polished JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"),
                        help="Output naturalized JSONL file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load all polished entries (both status=polished and status=discarded with raw_text)
    candidates = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue

            status = entry.get("status", "")
            if status == "polished":
                # Already polished — naturalize the polished text
                candidates.append(entry)
            elif status == "discarded":
                # Was discarded by first polish — try naturalizing the raw
                # text, but only if plausibly salvageable (reasonable length,
                # no leftover template underscores).
                raw = entry.get("raw_text", "")
                if raw and 4 <= len(raw.split()) <= 25 and "_" not in raw:
                    entry["_from_discard"] = True
                    candidates.append(entry)

    print(f"Loaded {len(candidates)} candidates "
          f"({sum(1 for c in candidates if not c.get('_from_discard'))} polished, "
          f"{sum(1 for c in candidates if c.get('_from_discard'))} from discards)")

    # Resume check: anything already in the output file is skipped.
    already_processed, prev_counts = load_already_processed(output_path)
    remaining = [e for e in candidates if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)} "
          f"(naturalized={prev_counts['naturalized']}, "
          f"unchanged={prev_counts['unchanged']}, "
          f"skipped={prev_counts['skipped']}, "
          f"filtered={prev_counts['filtered']}, "
          f"errors={prev_counts['error']})")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    naturalized = 0
    unchanged = 0
    skipped = 0
    filtered_out = 0
    errors = 0
    consecutive_errors = 0
    start_time = time.time()
    # Fix: pre-bind the loop index so the KeyboardInterrupt handler below
    # cannot hit a NameError when the interrupt lands before the first
    # iteration (e.g. while opening the output file).
    i = -1

    try:
        with open(output_path, "a", encoding="utf-8") as out:
            for i, entry in enumerate(remaining):
                # Determine what text to send: discards never got a polished
                # text, so they go through on their raw form.
                if entry.get("_from_discard"):
                    input_text = entry.get("raw_text", "")
                else:
                    input_text = entry.get("polished_text", entry.get("raw_text", ""))

                response, error_type = llm_chat_completion(input_text)

                if response is None:
                    entry["naturalize_status"] = "error"
                    entry["naturalize_error"] = error_type
                    errors += 1
                    consecutive_errors += 1

                    if consecutive_errors >= 20:
                        # Sustained failure — likely the server is down.
                        print(f"\nFATAL: {consecutive_errors} consecutive errors. Stopping.",
                              file=sys.stderr)
                        # Fix: strip the internal marker before writing so the
                        # aborting write matches the normal-path output schema.
                        entry.pop("_from_discard", None)
                        out.write(json.dumps(entry, ensure_ascii=False) + "\n")
                        out.flush()
                        sys.exit(1)

                elif response.strip().upper() == "SKIP":
                    # Model judged the saying unsalvageable.
                    entry["naturalize_status"] = "skipped"
                    skipped += 1
                    consecutive_errors = 0

                else:
                    cleaned = response.strip()
                    # Strip quotes if the model wrapped its answer.
                    if cleaned.startswith('"') and cleaned.endswith('"'):
                        cleaned = cleaned[1:-1]

                    # Apply relaxed quality filter (artifacts/nonsense only).
                    passed, reason = relaxed_quality_filter(cleaned)
                    if not passed:
                        entry["naturalize_status"] = "filtered"
                        entry["naturalize_filter_reason"] = reason
                        filtered_out += 1
                    elif cleaned == input_text:
                        entry["naturalized_text"] = cleaned
                        entry["naturalize_status"] = "unchanged"
                        unchanged += 1
                    else:
                        entry["naturalized_text"] = cleaned
                        entry["naturalize_status"] = "naturalized"
                        naturalized += 1

                    consecutive_errors = 0

                # Clean up internal field before persisting.
                entry.pop("_from_discard", None)

                out.write(json.dumps(entry, ensure_ascii=False) + "\n")

                # Flush periodically so a crash loses at most 10 entries.
                if (i + 1) % 10 == 0:
                    out.flush()

                if (i + 1) % 100 == 0:
                    total_done = len(already_processed) + i + 1
                    elapsed = time.time() - start_time
                    rate = (i + 1) / elapsed
                    eta_min = (len(remaining) - (i + 1)) / rate / 60 if rate > 0 else 0
                    print(f" [{total_done}/{len(candidates)}] "
                          f"naturalized={naturalized}, unchanged={unchanged}, "
                          f"skipped={skipped}, filtered={filtered_out}, errors={errors} "
                          f"({rate:.1f}/s, ETA {eta_min:.0f}m)")

                # Small courtesy delay between requests.
                time.sleep(0.1)

    except KeyboardInterrupt:
        print(f"\nInterrupted at {i+1}/{len(remaining)}. Re-run to resume.", file=sys.stderr)

    elapsed = time.time() - start_time
    total = naturalized + unchanged + skipped + filtered_out + errors
    print(f"\nSession complete: {total} entries in {elapsed/60:.1f} minutes.")
    print(f" Naturalized: {naturalized}")
    print(f" Unchanged: {unchanged}")
    print(f" Skipped: {skipped}")
    print(f" Filtered: {filtered_out}")
    print(f" Errors: {errors}")
    usable = naturalized + unchanged
    print(f" Usable: {usable}")
    print(f"Output: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
278
scripts/rebuild_training_pairs.py
Normal file
278
scripts/rebuild_training_pairs.py
Normal file
|
|
@ -0,0 +1,278 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Rebuild training pairs from naturalized corpus.
|
||||||
|
|
||||||
|
Reads corpus_naturalized.jsonl, applies relaxed quality filter,
|
||||||
|
deduplicates, and formats training pairs. Replaces the separate
|
||||||
|
filter_corpus.py + format_training_pairs.py steps.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/rebuild_training_pairs.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Project layout: scripts/ lives beside corpus/, data/ and examples/.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
DATA_DIR = PROJECT_DIR / "data"
EXAMPLES_DIR = PROJECT_DIR / "examples"

# Speaker personas for "What would a <persona> say about ..." prompts.
PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]

# Seed-free prompts, sampled for a fraction of entries in generate_training_pairs.
OPEN_ENDED_PROMPTS = [
    "Tell me some folk wisdom.",
    "What do they say?",
    "Give me a proverb.",
    "Share some old-time wisdom.",
    "What's a good saying?",
]

# Human-readable names for the template families, used in template-seeded prompts.
TEMPLATE_NAMES = {
    "deconstruction": "deconstruction",
    "denial_of_consequences": "denial of consequences",
    "ironic_deficiency": "ironic deficiency",
    "futile_preparation": "futile preparation",
    "hypocritical_complaint": "hypocritical complaint",
    "tautological_wisdom": "tautological wisdom",
    "false_equivalence": "false equivalence",
}
|
||||||
|
|
||||||
|
|
||||||
|
def is_near_duplicate(text_a, text_b, threshold=0.75):
    """Return True when the two strings are near-identical, ignoring case."""
    similarity = SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio()
    return similarity > threshold


def deduplicate_within_family(entries):
    """Remove near-duplicate sayings, comparing only within a template family.

    The first occurrence of each near-duplicate cluster wins; later lookalikes
    are dropped.

    Returns:
        (kept, removed) — the surviving entries and the number dropped.
    """
    families = {}
    for item in entries:
        families.setdefault(item.get("meta_template", "unknown"), []).append(item)

    survivors = []
    dropped = 0

    for members in families.values():
        unique = []
        for candidate in members:
            candidate_text = candidate.get("final_text", "")
            if any(is_near_duplicate(candidate_text, keeper.get("final_text", ""))
                   for keeper in unique):
                dropped += 1
            else:
                unique.append(candidate)
        survivors.extend(unique)

    return survivors, dropped
|
||||||
|
|
||||||
|
|
||||||
|
def load_vocab_categories():
    """Map each vocab word to its list of categories from folksy_vocab.csv.

    Returns an empty dict when the vocab file is absent.
    """
    categories_by_word = {}
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as fh:
            for row in csv.DictReader(fh):
                tags = row["categories"].split(",")
                categories_by_word[row["word"]] = [t.strip() for t in tags if t.strip()]
    return categories_by_word
|
||||||
|
|
||||||
|
|
||||||
|
def generate_training_pairs(entry, word_cats):
    """Build instruction/response training pairs for one corpus entry.

    Emits up to five pairs per saying: word-seeded and persona-seeded (when
    slot words exist), category-seeded (when any slot word has vocab
    categories), template-seeded (~70% of entries) and open-ended (~30%).
    """
    text = entry.get("final_text", "")
    slots = entry.get("slots", {})
    meta_template = entry.get("meta_template", "")

    # Slot fillers usable as prompt seeds; skip empties, single characters,
    # and article-prefixed phrases.
    source_words = [
        filler for filler in slots.values()
        if filler and len(filler) > 1 and not filler.startswith(("a ", "an "))
    ]

    slot_categories = set()
    for filler in source_words:
        key = filler.lower().replace(" ", "_")
        slot_categories.update(word_cats.get(key, ()))

    base = {
        "output": text,
        "meta_template": meta_template,
        "source_words": source_words,
    }
    pairs = []

    if source_words:
        seed = random.choice(source_words)
        pairs.append({**base, "input": f"Tell me something about {seed}."})

    if slot_categories:
        category = random.choice(list(slot_categories))
        pairs.append({**base, "input": f"Tell me a saying about {category}."})

    # NB: the persona is drawn unconditionally (even when source_words is
    # empty) so the RNG call sequence matches the original behavior.
    persona = random.choice(PERSONAS)
    if source_words:
        seed = random.choice(source_words)
        pairs.append({**base, "input": f"What would a {persona} say about {seed}?"})

    if random.random() < 0.7:
        template_name = TEMPLATE_NAMES.get(meta_template, meta_template)
        pairs.append({**base, "input": f"Give me a {template_name} proverb."})

    if random.random() < 0.3:
        pairs.append({**base, "input": random.choice(OPEN_ENDED_PROMPTS)})

    return pairs
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Rebuild the filtered corpus, training pairs, and stats file.

    Pipeline: load usable naturalized entries -> near-duplicate removal
    within template families -> write filtered corpus -> generate training
    pairs -> write pairs plus a JSON stats summary.
    """
    parser = argparse.ArgumentParser(description="Rebuild training pairs from naturalized corpus.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"))
    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"))
    parser.add_argument("--filtered-output", default=str(CORPUS_DIR / "corpus_filtered.jsonl"))
    parser.add_argument("--stats-output", default=str(CORPUS_DIR / "corpus_stats.json"))
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    filtered_path = Path(args.filtered_output)
    stats_path = Path(args.stats_output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load naturalized entries — use naturalized_text if available, else polished_text
    usable = []
    total_loaded = 0
    status_counts = Counter()

    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            total_loaded += 1
            nat_status = entry.get("naturalize_status", "")
            status_counts[nat_status] += 1

            if nat_status in ("naturalized", "unchanged"):
                final = entry.get("naturalized_text", entry.get("polished_text", ""))
                if final:
                    entry["final_text"] = final
                    usable.append(entry)

    print(f"Loaded {total_loaded} entries from {input_path}")
    print(f"Status breakdown: {dict(status_counts)}")
    print(f"Usable (naturalized + unchanged): {len(usable)}")

    # Deduplicate within each template family.
    kept, dup_count = deduplicate_within_family(usable)
    print(f"Near-duplicate removal: {dup_count} removed, {len(kept)} remaining")

    # Write filtered corpus.
    filtered_path.parent.mkdir(parents=True, exist_ok=True)
    with open(filtered_path, "w", encoding="utf-8") as f:
        for entry in kept:
            # Write with final_text as polished_text for compatibility
            out_entry = {k: v for k, v in entry.items() if k != "final_text"}
            out_entry["polished_text"] = entry["final_text"]
            f.write(json.dumps(out_entry, ensure_ascii=False) + "\n")

    print(f"Filtered corpus: {len(kept)} entries -> {filtered_path}")

    # Generate training pairs.
    word_cats = load_vocab_categories()
    all_pairs = []
    for entry in kept:
        all_pairs.extend(generate_training_pairs(entry, word_cats))

    # Fix: ensure the output directory exists — filtered_path got this
    # treatment above, but output_path did not.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    # Stats: classify each pair by which prompt pattern produced it.
    template_counts = Counter(e.get("meta_template", "unknown") for e in kept)
    input_type_counts = Counter()
    for pair in all_pairs:
        inp = pair["input"]
        if inp.startswith("Tell me something about"):
            input_type_counts["word_seeded"] += 1
        elif inp.startswith("Tell me a saying about"):
            input_type_counts["category_seeded"] += 1
        elif inp.startswith("What would a"):
            input_type_counts["persona_seeded"] += 1
        elif inp.startswith("Give me a") and "proverb" in inp:
            input_type_counts["template_seeded"] += 1
        else:
            input_type_counts["open_ended"] += 1

    # Vocab coverage: which vocab words appear as slot fillers.
    vocab_words = set()
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                vocab_words.add(row["word"])

    used_words = set()
    for entry in kept:
        for v in entry.get("slots", {}).values():
            # Fix: slot values can be empty/None (generate_training_pairs
            # already guards for this) — skip instead of crashing on .lower().
            if not v:
                continue
            word = v.lower().replace(" ", "_")
            if word in vocab_words:
                used_words.add(word)

    lengths = [len(e["final_text"].split()) for e in kept if e.get("final_text")]

    stats = {
        "naturalization_input": total_loaded,
        "naturalization_status": dict(status_counts),
        "usable_before_dedup": len(usable),
        "duplicates_removed": dup_count,
        "final_filtered": len(kept),
        "training_pairs": len(all_pairs),
        "by_template": dict(sorted(template_counts.items())),
        "by_input_type": dict(sorted(input_type_counts.items())),
        "vocab_coverage": f"{len(used_words)}/{len(vocab_words)} ({len(used_words)/len(vocab_words)*100:.1f}%)" if vocab_words else "N/A",
        "avg_length_words": round(sum(lengths) / len(lengths), 1) if lengths else 0,
    }

    # Fix: create the stats directory too, for symmetry with the other outputs.
    stats_path.parent.mkdir(parents=True, exist_ok=True)
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print("FINAL CORPUS STATS")
    print(f"{'='*50}")
    print(f"Unique sayings: {len(kept)}")
    print(f"Training pairs: {len(all_pairs)}")
    print(f"Avg length: {stats['avg_length_words']} words")
    print(f"Vocab coverage: {stats['vocab_coverage']}")
    print("\nBy template:")
    for t, c in sorted(template_counts.items()):
        pct = c / len(kept) * 100
        flag = " <-- below 10%" if pct < 10 else ""
        print(f" {t:30s} {c:5d} ({pct:5.1f}%){flag}")
    print("\nBy input type:")
    for t, c in sorted(input_type_counts.items()):
        print(f" {t:20s} {c:5d}")
    print("\nOutputs:")
    print(f" {filtered_path}")
    print(f" {output_path}")
    print(f" {stats_path}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue