Add naturalization pass — 9,025 sayings, 36K training pairs
New pipeline step: naturalize_corpus.py runs Prompt A ("dialect coach")
over both polished and previously-discarded sayings, recovering material
the first polish pass was too aggressive with.
Results:
- 9,468 usable from naturalization (vs 5,499 from initial polish)
- After dedup: 9,025 unique sayings (was 2,312)
- 36,079 training pairs (was 9,257)
- 100% vocab coverage, avg 10.1 words (punchier than 13.1)
- Relaxed quality filter: drops artifacts/nonsense, not noun presence
New scripts:
- naturalize_corpus.py: gentle LLM naturalization pass, resume-safe
- rebuild_training_pairs.py: combined filter + dedup + training pair
generation from naturalized corpus, replaces separate steps
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
651ec3ffc6
commit
9298c425bc
6 changed files with 65131 additions and 11532 deletions
306
scripts/naturalize_corpus.py
Normal file
306
scripts/naturalize_corpus.py
Normal file
|
|
@@ -0,0 +1,306 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Naturalization pass for polished folksy sayings.
|
||||
|
||||
Takes the polished corpus (both filtered and fixable discards) and runs a
|
||||
second LLM pass focused on making them sound like real folk sayings rather
|
||||
than template output. Uses Prompt A (gentle naturalization).
|
||||
|
||||
Resume-safe: tracks already-processed entries by raw_text.
|
||||
|
||||
Usage:
|
||||
python3 scripts/naturalize_corpus.py
|
||||
python3 scripts/naturalize_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_naturalized.jsonl
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Filesystem layout, derived from this file's own location: the script sits in
# <project>/scripts and the corpus files live in <project>/corpus.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

# Chat-completions endpoint and model name sent with every naturalization request.
# NOTE(review): this is a hard-coded LAN address, and the "v1d" path segment looks
# unusual (chat-completions servers commonly expose "/v1/") — confirm against the
# server configuration before changing it.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

# System prompt for the gentle naturalization pass (the module docstring calls it
# "Prompt A"). It asks the model to reply with the bare saying, or with the single
# word SKIP for unsalvageable input; main() checks for that SKIP sentinel.
SYSTEM_PROMPT = """You are a dialect coach for folk sayings. You'll receive a fake proverb that sounds slightly mechanical or template-generated. Your job is to make it sound like something a real person would actually say — natural rhythm, casual grammar, the kind of thing you'd overhear at a general store.

Rules:
- Keep the same meaning and core nouns
- Fix awkward phrasing, robotic word order, or template artifacts
- Make it conversational — contractions, folksy grammar, natural cadence
- Keep it SHORT (under 20 words preferred)
- If it already sounds natural, return it unchanged
- If it's unsalvageable nonsense, respond with: SKIP

Output ONLY the naturalized saying. No quotes, no explanation."""
|
||||
|
||||
|
||||
def llm_chat_completion(text, max_retries=3):
    """Ask the LLM to naturalize one saying, retrying transient failures.

    Args:
        text: The saying to send as the user message.
        max_retries: Maximum number of POST attempts.

    Returns:
        A ``(result, error_type)`` tuple. On success ``result`` is the stripped
        reply and ``error_type`` is ``None``. On failure ``result`` is ``None``
        and ``error_type`` is one of ``"context_too_long"``, ``"http_400"``,
        ``"server_overload"``, ``"null_content"``, ``"json_error"``,
        ``"unexpected"`` or ``"exhausted_retries"``.
    """
    import requests

    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
        "temperature": 0.7,
    }
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        # Every retryable outcome below records (failure, delay); success and
        # non-retryable outcomes return straight away.
        try:
            resp = requests.post(LLM_ENDPOINT, json=payload, timeout=120)
            code = resp.status_code

            if code == 400:
                lowered = resp.text.lower()
                # A 400 that mentions context/token/length will not improve on
                # retry, so give up immediately.
                if "context" in lowered or "token" in lowered or "length" in lowered:
                    return None, "context_too_long"
                failure, delay = "http_400", 2 ** attempt
            elif code in (429, 503):
                # Overloaded server: back off one step longer than other errors.
                failure, delay = "server_overload", 2 ** (attempt + 1)
            else:
                resp.raise_for_status()
                try:
                    content = resp.json()["choices"][0]["message"]["content"]
                except (json.JSONDecodeError, KeyError, IndexError) as e:
                    print(f"  Parse error (attempt {attempt+1}): {e}", file=sys.stderr)
                    failure, delay = "json_error", 2 ** attempt
                else:
                    if content is not None:
                        return content.strip(), None
                    failure, delay = "null_content", 1

        except Exception as e:
            print(f"  Error (attempt {attempt+1}): {type(e).__name__}: {e}", file=sys.stderr)
            failure, delay = "unexpected", 2 ** attempt

        if attempt >= final_attempt:
            return None, failure
        time.sleep(delay)

    # Only reachable when max_retries is zero or negative.
    return None, "exhausted_retries"
|
||||
|
||||
|
||||
def relaxed_quality_filter(text):
    """Relaxed filter: only catches artifacts and nonsense, not noun presence.

    Checks run in this order, and the first failure wins:
    empty text, more than 25 words, fewer than 4 words, an underscore
    (ConceptNet-style artifact), a curly brace (unfilled template slot),
    and finally phrases that indicate the LLM leaked commentary about the
    saying instead of returning the saying itself.

    Args:
        text: Candidate saying, already stripped of surrounding quotes.

    Returns:
        ``(passed, reason)`` where ``reason`` is ``"pass"`` on success or the
        name of the failed check otherwise.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list; the re module caches the compiled pattern.
    import re

    if not text:
        return False, "empty"

    word_count = len(text.split())
    if word_count > 25:
        return False, "too_long"
    if word_count < 4:
        return False, "too_short"

    # Template artifacts
    if "_" in text:
        return False, "conceptnet_artifact"
    if "{" in text or "}" in text:
        return False, "unfilled_slot"

    # LLM meta-commentary leaks.
    # Keywords must begin at a word boundary: a plain substring test made
    # "here's"/"here is" fire inside "there's", "where's" and "there is",
    # rejecting perfectly good sayings. The typographic apostrophe is folded
    # to ASCII first so "Here’s ..." is still caught.
    lower = text.lower().replace("\u2019", "'")
    meta_pattern = (
        r"\b(?:here's|here is|this saying|i've|i have|"
        r"note:|explanation:|bridge word)"
    )
    if re.search(meta_pattern, lower):
        return False, "meta_commentary"

    return True, "pass"
|
||||
|
||||
|
||||
def load_already_processed(output_path):
    """Load the set of raw_text values already written, for resume.

    Args:
        output_path: ``pathlib.Path`` of the naturalized JSONL output. A
            missing file is treated as "nothing processed yet".

    Returns:
        ``(processed, counts)``: ``processed`` is the set of ``raw_text``
        values found (an entry without that key contributes ``""``), and
        ``counts`` tallies entries per known ``naturalize_status``. Entries
        with an unrecognized or missing status are still marked processed
        but are not counted.
    """
    processed = set()
    counts = {"naturalized": 0, "skipped": 0, "unchanged": 0, "error": 0, "filtered": 0}

    if not output_path.exists():
        return processed, counts

    with open(output_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line (for example a partially written record): ignore it.
                continue
            if not isinstance(entry, dict):
                # Valid JSON that is not an object (null, list, bare string)
                # has no fields to read; skip it instead of crashing on .get().
                continue

            processed.add(entry.get("raw_text", ""))
            status = entry.get("naturalize_status", "")
            if status in counts:
                counts[status] += 1

    return processed, counts
|
||||
|
||||
|
||||
def _load_candidates(input_path):
    """Collect the entries from the polished corpus that should be naturalized.

    Entries with status "polished" are always kept. Entries with status
    "discarded" are kept only when their raw_text has 4-25 words and no
    underscore; those are tagged with the internal "_from_discard" flag so
    the main loop sends raw_text rather than polished_text.
    """
    candidates = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(entry, dict):
                # Valid JSON but not an object: nothing to naturalize.
                continue

            status = entry.get("status", "")
            if status == "polished":
                # Already polished — naturalize the polished text
                candidates.append(entry)
            elif status == "discarded":
                # Was discarded by first polish — try naturalizing the raw text
                raw = entry.get("raw_text", "")
                if raw and 4 <= len(raw.split()) <= 25 and "_" not in raw:
                    entry["_from_discard"] = True
                    candidates.append(entry)
    return candidates


def _classify_response(response, input_text):
    """Turn a non-None LLM reply into the fields to record on the entry.

    The returned dict always contains "naturalize_status" ("skipped",
    "filtered", "unchanged" or "naturalized") plus "naturalized_text" or
    "naturalize_filter_reason" where applicable.
    """
    cleaned = response.strip()
    if cleaned.upper() == "SKIP":
        return {"naturalize_status": "skipped"}

    # Strip quotes if wrapped
    if cleaned.startswith('"') and cleaned.endswith('"'):
        cleaned = cleaned[1:-1]

    passed, reason = relaxed_quality_filter(cleaned)
    if not passed:
        return {"naturalize_status": "filtered", "naturalize_filter_reason": reason}
    if cleaned == input_text:
        return {"naturalized_text": cleaned, "naturalize_status": "unchanged"}
    return {"naturalized_text": cleaned, "naturalize_status": "naturalized"}


def main():
    """Run the naturalization pass over the polished corpus.

    Reads --input, skips entries whose raw_text already appears in --output,
    sends each remaining entry to the LLM and appends one JSON line per entry
    to --output. Exits with status 1 if the input file is missing or after
    20 consecutive LLM errors. Ctrl-C stops the run cleanly; re-running
    resumes from the output file.
    """
    parser = argparse.ArgumentParser(description="Naturalization pass for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Input polished JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"),
                        help="Output naturalized JSONL file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load all polished entries (both status=polished and status=discarded with raw_text)
    candidates = _load_candidates(input_path)
    from_discards = sum(1 for c in candidates if c.get("_from_discard"))
    print(f"Loaded {len(candidates)} candidates "
          f"({len(candidates) - from_discards} polished, "
          f"{from_discards} from discards)")

    # Resume check
    already_processed, prev_counts = load_already_processed(output_path)
    remaining = [e for e in candidates if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)} "
          f"(naturalized={prev_counts['naturalized']}, "
          f"unchanged={prev_counts['unchanged']}, "
          f"skipped={prev_counts['skipped']}, "
          f"filtered={prev_counts['filtered']}, "
          f"errors={prev_counts['error']})")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    counts = {"naturalized": 0, "unchanged": 0, "skipped": 0, "filtered": 0, "error": 0}
    consecutive_errors = 0
    # 1-based index of the entry in flight. Initialized here so the interrupt
    # handler below works even if Ctrl-C arrives before the first iteration.
    position = 0
    start_time = time.time()

    try:
        with open(output_path, "a", encoding="utf-8") as out:
            for position, entry in enumerate(remaining, start=1):
                # Determine what text to send
                if entry.get("_from_discard"):
                    input_text = entry.get("raw_text", "")
                else:
                    input_text = entry.get("polished_text", entry.get("raw_text", ""))

                response, error_type = llm_chat_completion(input_text)

                if response is None:
                    fields = {"naturalize_status": "error", "naturalize_error": error_type}
                    consecutive_errors += 1
                else:
                    fields = _classify_response(response, input_text)
                    consecutive_errors = 0

                entry.update(fields)
                counts[fields["naturalize_status"]] += 1

                # Clean up internal field. This happens before every write,
                # including the fatal-error write, so the flag never reaches
                # the output file.
                entry.pop("_from_discard", None)
                out.write(json.dumps(entry, ensure_ascii=False) + "\n")

                if consecutive_errors >= 20:
                    print(f"\nFATAL: {consecutive_errors} consecutive errors. Stopping.",
                          file=sys.stderr)
                    out.flush()
                    sys.exit(1)

                if position % 10 == 0:
                    out.flush()

                if position % 100 == 0:
                    total_done = len(already_processed) + position
                    elapsed = time.time() - start_time
                    rate = position / elapsed
                    eta_min = (len(remaining) - position) / rate / 60 if rate > 0 else 0
                    print(f"  [{total_done}/{len(candidates)}] "
                          f"naturalized={counts['naturalized']}, unchanged={counts['unchanged']}, "
                          f"skipped={counts['skipped']}, filtered={counts['filtered']}, "
                          f"errors={counts['error']} "
                          f"({rate:.1f}/s, ETA {eta_min:.0f}m)")

                # Small pause between requests.
                time.sleep(0.1)

    except KeyboardInterrupt:
        print(f"\nInterrupted at {position}/{len(remaining)}. Re-run to resume.", file=sys.stderr)

    elapsed = time.time() - start_time
    total = sum(counts.values())
    print(f"\nSession complete: {total} entries in {elapsed/60:.1f} minutes.")
    print(f"  Naturalized: {counts['naturalized']}")
    print(f"  Unchanged:   {counts['unchanged']}")
    print(f"  Skipped:     {counts['skipped']}")
    print(f"  Filtered:    {counts['filtered']}")
    print(f"  Errors:      {counts['error']}")
    usable = counts["naturalized"] + counts["unchanged"]
    print(f"  Usable:      {usable}")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue