Add naturalization pass — 9,025 sayings, 36K training pairs

New pipeline step: naturalize_corpus.py runs Prompt A ("dialect coach") over both polished and previously-discarded sayings, recovering material the first polish pass was too aggressive with.

Results:
- 9,468 usable from naturalization (vs 5,499 from initial polish)
- After dedup: 9,025 unique sayings (was 2,312)
- 36,079 training pairs (was 9,257)
- 100% vocab coverage, avg 10.1 words (punchier than 13.1)
- Relaxed quality filter: drops artifacts/nonsense, not noun presence

New scripts:
- naturalize_corpus.py: gentle LLM naturalization pass, resume-safe
- rebuild_training_pairs.py: combined filter + dedup + training pair generation from naturalized corpus, replaces separate steps

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 07:24:37 -04:00 · 2026-03-10 07:24:37 -04:00 · 9298c425bc
commit 9298c425bc
parent 651ec3ffc6
6 changed files with 65131 additions and 11532 deletions
--- a/scripts/naturalize_corpus.py
+++ b/scripts/naturalize_corpus.py
@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""Naturalization pass for polished folksy sayings.
+
+Takes the polished corpus (both filtered and fixable discards) and runs a
+second LLM pass focused on making them sound like real folk sayings rather
+than template output. Uses Prompt A (gentle naturalization).
+
+Resume-safe: tracks already-processed entries by raw_text.
+
+Usage:
+  python3 scripts/naturalize_corpus.py
+  python3 scripts/naturalize_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_naturalized.jsonl
+"""
+
+import argparse
+import json
+import sys
+import time
+from pathlib import Path
+
+# Paths are derived from this script's own location: the directory holding
+# the script, that directory's parent, and the parent's "corpus" folder
+# (used for the default --input / --output paths).
+SCRIPT_DIR = Path(__file__).parent
+PROJECT_DIR = SCRIPT_DIR.parent
+CORPUS_DIR = PROJECT_DIR / "corpus"
+
+# URL that llm_chat_completion() POSTs to, and the model name sent in the
+# "model" field of each request body.
+# NOTE(review): the path segment is "v1d" rather than the more common "v1" —
+# confirm this matches the server's actual route.
+LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
+LLM_MODEL = "THUDM-GLM4-32B"
+
+# System message sent with every request (presumably the "Prompt A" named in
+# the module docstring). The literal reply SKIP it asks for is the sentinel
+# main() checks to mark an entry as skipped.
+SYSTEM_PROMPT = """You are a dialect coach for folk sayings. You'll receive a fake proverb that sounds slightly mechanical or template-generated. Your job is to make it sound like something a real person would actually say — natural rhythm, casual grammar, the kind of thing you'd overhear at a general store.
+
+Rules:
+- Keep the same meaning and core nouns
+- Fix awkward phrasing, robotic word order, or template artifacts
+- Make it conversational — contractions, folksy grammar, natural cadence
+- Keep it SHORT (under 20 words preferred)
+- If it already sounds natural, return it unchanged
+- If it's unsalvageable nonsense, respond with: SKIP
+
+Output ONLY the naturalized saying. No quotes, no explanation."""
+
+
+def llm_chat_completion(text, max_retries=3):
+    """Ask the LLM to naturalize one saying, retrying on transient failures.
+
+    Args:
+        text: The saying to send as the user message.
+        max_retries: Maximum number of POST attempts.
+
+    Returns:
+        A ``(result, error_type)`` tuple. On success ``result`` is the
+        stripped reply and ``error_type`` is None. On failure ``result`` is
+        None and ``error_type`` is one of "context_too_long", "http_400",
+        "server_overload", "null_content", "json_error", "unexpected" or
+        "exhausted_retries".
+    """
+    import requests
+
+    # The request body is identical on every attempt, so build it once.
+    payload = {
+        "model": LLM_MODEL,
+        "messages": [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": text},
+        ],
+        "temperature": 0.7,
+    }
+    last_attempt = max_retries - 1
+
+    for attempt in range(max_retries):
+        retry_allowed = attempt < last_attempt
+        try:
+            resp = requests.post(LLM_ENDPOINT, json=payload, timeout=120)
+            status = resp.status_code
+
+            if status == 400:
+                # A 400 whose body mentions context/token/length is reported
+                # immediately instead of being retried.
+                lowered_body = resp.text.lower()
+                if ("context" in lowered_body or "token" in lowered_body
+                        or "length" in lowered_body):
+                    return None, "context_too_long"
+                if not retry_allowed:
+                    return None, "http_400"
+                time.sleep(2 ** attempt)
+                continue
+
+            if status == 429 or status == 503:
+                if not retry_allowed:
+                    return None, "server_overload"
+                # Overload responses wait one backoff step longer than others.
+                time.sleep(2 ** (attempt + 1))
+                continue
+
+            resp.raise_for_status()
+
+            try:
+                content = resp.json()["choices"][0]["message"]["content"]
+            except (json.JSONDecodeError, KeyError, IndexError) as e:
+                print(f"  Parse error (attempt {attempt+1}): {e}", file=sys.stderr)
+                if not retry_allowed:
+                    return None, "json_error"
+                time.sleep(2 ** attempt)
+                continue
+            else:
+                if content is not None:
+                    return content.strip(), None
+                # A null content field gets a short fixed pause before retrying.
+                if not retry_allowed:
+                    return None, "null_content"
+                time.sleep(1)
+                continue
+
+        except Exception as e:
+            # Anything else (connection errors, HTTP errors raised above, ...)
+            # is logged and retried with exponential backoff.
+            print(f"  Error (attempt {attempt+1}): {type(e).__name__}: {e}", file=sys.stderr)
+            if not retry_allowed:
+                return None, "unexpected"
+            time.sleep(2 ** attempt)
+            continue
+
+    # Only reachable when the loop body never ran (max_retries <= 0).
+    return None, "exhausted_retries"
+
+
+def relaxed_quality_filter(text):
+    """Reject only obvious artifacts and nonsense; noun presence is not checked.
+
+    Checks run in this order: empty text, word count (more than 25 or fewer
+    than 4), underscore, curly braces, then LLM meta-commentary phrases
+    (matched case-insensitively as substrings).
+
+    Returns:
+        ``(passed, reason)`` where ``reason`` is "pass" on success, otherwise
+        "empty", "too_long", "too_short", "conceptnet_artifact",
+        "unfilled_slot" or "meta_commentary".
+    """
+    if not text:
+        return False, "empty"
+
+    word_count = len(text.split())
+    if word_count > 25:
+        return False, "too_long"
+    if word_count < 4:
+        return False, "too_short"
+
+    # Leftovers from template generation
+    if "_" in text:
+        return False, "conceptnet_artifact"
+    if any(brace in text for brace in ("{", "}")):
+        return False, "unfilled_slot"
+
+    # Phrases that suggest the model talked about the saying instead of
+    # returning it
+    meta_markers = (
+        "here's", "here is", "this saying", "i've", "i have",
+        "note:", "explanation:", "bridge word",
+    )
+    lowered = text.lower()
+    for marker in meta_markers:
+        if marker in lowered:
+            return False, "meta_commentary"
+
+    return True, "pass"
+
+
+def load_already_processed(output_path):
+    """Scan an existing output file so a re-run can skip finished entries.
+
+    Args:
+        output_path: Path of the naturalized JSONL file; it may not exist yet.
+
+    Returns:
+        ``(processed, counts)``: the set of ``raw_text`` values found in the
+        file (an entry without that key contributes ""), and a dict tallying
+        how many entries carry each known ``naturalize_status``. Blank lines
+        and lines that are not valid JSON are ignored.
+    """
+    seen = set()
+    tallies = dict.fromkeys(
+        ("naturalized", "skipped", "unchanged", "error", "filtered"), 0
+    )
+    if not output_path.exists():
+        return seen, tallies
+
+    with open(output_path, encoding="utf-8") as handle:
+        for raw_line in handle:
+            record_json = raw_line.strip()
+            if not record_json:
+                continue
+            try:
+                record = json.loads(record_json)
+            except json.JSONDecodeError:
+                continue
+            else:
+                seen.add(record.get("raw_text", ""))
+                outcome = record.get("naturalize_status", "")
+                # Unknown or missing statuses are remembered but not tallied.
+                if outcome in tallies:
+                    tallies[outcome] += 1
+
+    return seen, tallies
+
+
+def _load_candidates(input_path):
+    """Read the polished corpus and return the entries to naturalize.
+
+    Entries with status "polished" are always kept. Entries with status
+    "discarded" are kept only when their raw_text has 4-25 words and no
+    underscore; those are tagged with the internal ``_from_discard`` flag so
+    the caller sends raw_text instead of polished_text. Blank lines, invalid
+    JSON and JSON values that are not objects are ignored.
+    """
+    candidates = []
+    with open(input_path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if not isinstance(entry, dict):
+                # Valid JSON but not a record (e.g. a bare list): skip it
+                # rather than crash on .get() below.
+                continue
+
+            status = entry.get("status", "")
+            if status == "polished":
+                # Already polished — naturalize the polished text
+                candidates.append(entry)
+            elif status == "discarded":
+                # Was discarded by first polish — try naturalizing the raw text
+                raw = entry.get("raw_text", "")
+                if raw and 4 <= len(raw.split()) <= 25 and "_" not in raw:
+                    entry["_from_discard"] = True
+                    candidates.append(entry)
+    return candidates
+
+
+def _record_outcome(entry, input_text, response, error_type):
+    """Annotate ``entry`` in place with the result of one LLM call.
+
+    Returns the value written to ``entry["naturalize_status"]``: "error",
+    "skipped", "filtered", "unchanged" or "naturalized".
+    """
+    if response is None:
+        entry["naturalize_status"] = "error"
+        entry["naturalize_error"] = error_type
+        return "error"
+
+    cleaned = response.strip()
+    if cleaned.upper() == "SKIP":
+        entry["naturalize_status"] = "skipped"
+        return "skipped"
+
+    # The prompt asks for no quotes; unwrap them if the model adds them anyway,
+    # and drop any whitespace that was sitting inside the quotes.
+    if cleaned.startswith('"') and cleaned.endswith('"'):
+        cleaned = cleaned[1:-1].strip()
+
+    passed, reason = relaxed_quality_filter(cleaned)
+    if not passed:
+        entry["naturalize_status"] = "filtered"
+        entry["naturalize_filter_reason"] = reason
+        return "filtered"
+
+    entry["naturalized_text"] = cleaned
+    status = "unchanged" if cleaned == input_text else "naturalized"
+    entry["naturalize_status"] = status
+    return status
+
+
+def main():
+    """Run the naturalization pass from the command line.
+
+    Loads candidates from --input, drops those whose raw_text already appears
+    in --output, sends each remaining saying to the LLM, and appends one
+    annotated JSON line per entry to --output. Exits with status 1 when the
+    input file is missing or after 20 consecutive LLM errors; Ctrl-C stops
+    the loop and still prints the session summary.
+    """
+    parser = argparse.ArgumentParser(description="Naturalization pass for folksy sayings.")
+    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
+                        help="Input polished JSONL file")
+    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"),
+                        help="Output naturalized JSONL file")
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+
+    if not input_path.exists():
+        print(f"Error: {input_path} not found.", file=sys.stderr)
+        sys.exit(1)
+
+    candidates = _load_candidates(input_path)
+    discard_count = sum(1 for c in candidates if c.get("_from_discard"))
+    print(f"Loaded {len(candidates)} candidates "
+          f"({len(candidates) - discard_count} polished, "
+          f"{discard_count} from discards)")
+
+    # Resume check
+    # NOTE(review): entries that ended in "error" are written to the output
+    # too, so they count as processed here and are not retried on later runs.
+    already_processed, prev_counts = load_already_processed(output_path)
+    remaining = [e for e in candidates if e.get("raw_text", "") not in already_processed]
+
+    print(f"Already processed: {len(already_processed)} "
+          f"(naturalized={prev_counts['naturalized']}, "
+          f"unchanged={prev_counts['unchanged']}, "
+          f"skipped={prev_counts['skipped']}, "
+          f"filtered={prev_counts['filtered']}, "
+          f"errors={prev_counts['error']})")
+    print(f"Remaining: {len(remaining)}")
+
+    if not remaining:
+        print("Nothing to process.")
+        return
+
+    counts = dict.fromkeys(
+        ("naturalized", "unchanged", "skipped", "filtered", "error"), 0
+    )
+    consecutive_errors = 0
+    # Number of entries actually written this session. Tracked separately from
+    # the loop index so the interrupt handler never reads an unbound variable
+    # and never counts an entry whose request was still in flight.
+    completed = 0
+    start_time = time.time()
+
+    try:
+        with open(output_path, "a", encoding="utf-8") as out:
+            for i, entry in enumerate(remaining):
+                # Remove the internal flag up front so it cannot reach the
+                # output file on any exit path.
+                if entry.pop("_from_discard", False):
+                    input_text = entry.get("raw_text", "")
+                else:
+                    input_text = entry.get("polished_text", entry.get("raw_text", ""))
+
+                response, error_type = llm_chat_completion(input_text)
+                status = _record_outcome(entry, input_text, response, error_type)
+                counts[status] += 1
+                consecutive_errors = consecutive_errors + 1 if status == "error" else 0
+
+                out.write(json.dumps(entry, ensure_ascii=False) + "\n")
+                completed = i + 1
+
+                if consecutive_errors >= 20:
+                    # A long unbroken run of failures means the endpoint is
+                    # probably down; stop instead of burning through the corpus.
+                    print(f"\nFATAL: {consecutive_errors} consecutive errors. Stopping.",
+                          file=sys.stderr)
+                    out.flush()
+                    sys.exit(1)
+
+                if completed % 10 == 0:
+                    out.flush()
+
+                if completed % 100 == 0:
+                    total_done = len(already_processed) + completed
+                    elapsed = time.time() - start_time
+                    rate = completed / elapsed
+                    eta_min = (len(remaining) - completed) / rate / 60 if rate > 0 else 0
+                    print(f"  [{total_done}/{len(candidates)}] "
+                          f"naturalized={counts['naturalized']}, unchanged={counts['unchanged']}, "
+                          f"skipped={counts['skipped']}, filtered={counts['filtered']}, "
+                          f"errors={counts['error']} "
+                          f"({rate:.1f}/s, ETA {eta_min:.0f}m)")
+
+                # Brief pause between requests
+                time.sleep(0.1)
+
+    except KeyboardInterrupt:
+        print(f"\nInterrupted at {completed}/{len(remaining)}. Re-run to resume.", file=sys.stderr)
+
+    elapsed = time.time() - start_time
+    total = sum(counts.values())
+    print(f"\nSession complete: {total} entries in {elapsed/60:.1f} minutes.")
+    print(f"  Naturalized: {counts['naturalized']}")
+    print(f"  Unchanged:   {counts['unchanged']}")
+    print(f"  Skipped:     {counts['skipped']}")
+    print(f"  Filtered:    {counts['filtered']}")
+    print(f"  Errors:      {counts['error']}")
+    usable = counts["naturalized"] + counts["unchanged"]
+    print(f"  Usable:      {usable}")
+    print(f"Output: {output_path}")
+
+
+# Run the pass only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()