#!/usr/bin/env python3
"""Naturalization pass for polished folksy sayings.

Takes the polished corpus (both filtered and fixable discards) and runs a
second LLM pass focused on making them sound like real folk sayings rather
than template output. Uses Prompt A (gentle naturalization).

Resume-safe: tracks already-processed entries by raw_text.

Usage:
    python3 scripts/naturalize_corpus.py
    python3 scripts/naturalize_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_naturalized.jsonl
"""

import argparse
import json
import re
import sys
import time
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

SYSTEM_PROMPT = """You are a dialect coach for folk sayings. You'll receive a fake proverb that sounds slightly mechanical or template-generated. Your job is to make it sound like something a real person would actually say — natural rhythm, casual grammar, the kind of thing you'd overhear at a general store.

Rules:
- Keep the same meaning and core nouns
- Fix awkward phrasing, robotic word order, or template artifacts
- Make it conversational — contractions, folksy grammar, natural cadence
- Keep it SHORT (under 20 words preferred)
- If it already sounds natural, return it unchanged
- If it's unsalvageable nonsense, respond with: SKIP

Output ONLY the naturalized saying. No quotes, no explanation."""

# Phrases that indicate the model talked *about* the saying instead of
# returning it. The leading (?<!\w) stops a keyword from matching inside a
# longer word: a plain substring test flags "there is" / "there's" /
# "where's" because they contain "here is" / "here's". Matched against
# lowercased text.
_META_COMMENTARY_RE = re.compile(
    r"(?<!\w)(?:here's|here is|this saying|i've|i have|note:|explanation:|bridge word)"
)


def llm_chat_completion(text, max_retries=3):
    """Send text for naturalization. Returns (result, error_type).

    Posts ``text`` as the user message (with SYSTEM_PROMPT as the system
    message) to LLM_ENDPOINT and returns the stripped reply.

    Args:
        text: The saying to naturalize.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, error_type)`` where
        ``error_type`` is one of ``"context_too_long"``, ``"http_400"``,
        ``"server_overload"``, ``"null_content"``, ``"json_error"``,
        ``"unexpected"`` or ``"exhausted_retries"``.
    """
    # Imported lazily so the module can be imported (e.g. to use the filter)
    # without the third-party dependency installed.
    import requests

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]

    for attempt in range(max_retries):
        last_attempt = attempt >= max_retries - 1
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
                "temperature": 0.7,
            }, timeout=120)

            if resp.status_code == 400:
                body = resp.text.lower()
                # A too-long prompt will not get shorter on retry: fail fast.
                if any(kw in body for kw in ["context", "token", "length"]):
                    return None, "context_too_long"
                if not last_attempt:
                    time.sleep(2 ** attempt)
                    continue
                return None, "http_400"

            if resp.status_code in (429, 503):
                # Overload: back off one step longer than for other failures.
                if not last_attempt:
                    time.sleep(2 ** (attempt + 1))
                    continue
                return None, "server_overload"

            resp.raise_for_status()

            try:
                data = resp.json()
                content = data["choices"][0]["message"]["content"]
                if content is None:
                    if not last_attempt:
                        time.sleep(1)
                        continue
                    return None, "null_content"
                return content.strip(), None
            except (ValueError, KeyError, IndexError, TypeError) as e:
                # ValueError covers json.JSONDecodeError and the decode errors
                # of alternative JSON backends; TypeError covers a payload
                # whose shape is not the expected nested dict/list.
                print(f" Parse error (attempt {attempt+1}): {e}", file=sys.stderr)
                if not last_attempt:
                    time.sleep(2 ** attempt)
                    continue
                return None, "json_error"

        except Exception as e:
            # Top-level boundary for network/HTTP failures: log and retry.
            print(f" Error (attempt {attempt+1}): {type(e).__name__}: {e}", file=sys.stderr)
            if not last_attempt:
                time.sleep(2 ** attempt)
                continue
            return None, "unexpected"

    return None, "exhausted_retries"


def relaxed_quality_filter(text):
    """Relaxed filter: only catches artifacts and nonsense, not noun presence.

    Checks run in order: empty text, word count (4 to 25 words inclusive),
    template artifacts (``_`` or braces), then LLM meta-commentary phrases.
    Meta-commentary keywords must start at a word boundary, so ordinary
    wording such as "there is" or "there's" is not rejected.

    Returns (passed, reason); ``reason`` is ``"pass"`` when ``passed`` is True.
    """
    if not text:
        return False, "empty"

    words = text.split()
    if len(words) > 25:
        return False, "too_long"
    if len(words) < 4:
        return False, "too_short"

    # Template artifacts
    if "_" in text:
        return False, "conceptnet_artifact"
    if "{" in text or "}" in text:
        return False, "unfilled_slot"

    # LLM meta-commentary leaks
    if _META_COMMENTARY_RE.search(text.lower()):
        return False, "meta_commentary"

    return True, "pass"


def load_already_processed(output_path):
    """Load set of raw_text already processed for resume.

    Every record in the output file counts as processed regardless of its
    ``naturalize_status`` -- including ``"error"`` records, which are
    therefore not retried on a later run.

    Args:
        output_path: ``Path`` of the output JSONL file; may not exist yet.

    Returns:
        ``(processed, counts)``: the set of ``raw_text`` values found and a
        dict of per-status tallies.
    """
    processed = set()
    counts = {"naturalized": 0, "skipped": 0, "unchanged": 0, "error": 0, "filtered": 0}
    if not output_path.exists():
        return processed, counts

    with open(output_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object has no fields to read.
            if not isinstance(entry, dict):
                continue
            processed.add(entry.get("raw_text", ""))
            status = entry.get("naturalize_status", "")
            if status in counts:
                counts[status] += 1
    return processed, counts


def _load_candidates(input_path):
    """Read the polished corpus and return the entries worth naturalizing."""
    candidates = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(entry, dict):
                continue

            status = entry.get("status", "")
            if status == "polished":
                # Already polished — naturalize the polished text
                candidates.append(entry)
            elif status == "discarded":
                # Was discarded by first polish — try naturalizing the raw text
                raw = entry.get("raw_text", "")
                if (isinstance(raw, str) and raw
                        and 4 <= len(raw.split()) <= 25 and "_" not in raw):
                    entry["_from_discard"] = True
                    candidates.append(entry)
    return candidates


def _apply_response(entry, input_text, response):
    """Record a non-error LLM reply on ``entry`` and return the status set."""
    if response.strip().upper() == "SKIP":
        entry["naturalize_status"] = "skipped"
        return "skipped"

    cleaned = response.strip()
    # Strip quotes if wrapped
    if cleaned.startswith('"') and cleaned.endswith('"'):
        cleaned = cleaned[1:-1]

    # Apply relaxed quality filter
    passed, reason = relaxed_quality_filter(cleaned)
    if not passed:
        entry["naturalize_status"] = "filtered"
        entry["naturalize_filter_reason"] = reason
        return "filtered"

    status = "unchanged" if cleaned == input_text else "naturalized"
    entry["naturalized_text"] = cleaned
    entry["naturalize_status"] = status
    return status


def main():
    """Run the naturalization pass from the command line.

    Loads candidates from ``--input``, skips those whose ``raw_text`` already
    appears in ``--output``, sends the rest to the LLM one at a time and
    appends one JSON record per entry to ``--output``. Exits with status 1 if
    the input file is missing or after 20 consecutive LLM errors.
    """
    parser = argparse.ArgumentParser(description="Naturalization pass for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Input polished JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"),
                        help="Output naturalized JSONL file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load all polished entries (both status=polished and status=discarded with raw_text)
    candidates = _load_candidates(input_path)

    print(f"Loaded {len(candidates)} candidates "
          f"({sum(1 for c in candidates if not c.get('_from_discard'))} polished, "
          f"{sum(1 for c in candidates if c.get('_from_discard'))} from discards)")

    # Resume check
    already_processed, prev_counts = load_already_processed(output_path)
    remaining = [e for e in candidates if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)} "
          f"(naturalized={prev_counts['naturalized']}, "
          f"unchanged={prev_counts['unchanged']}, "
          f"skipped={prev_counts['skipped']}, "
          f"filtered={prev_counts['filtered']}, "
          f"errors={prev_counts['error']})")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    tally = {"naturalized": 0, "unchanged": 0, "skipped": 0, "filtered": 0, "error": 0}
    consecutive_errors = 0
    start_time = time.time()
    # Pre-bound so the KeyboardInterrupt handler can report progress even when
    # the interrupt arrives before the first loop iteration.
    i = -1

    try:
        with open(output_path, "a", encoding="utf-8") as out:
            for i, entry in enumerate(remaining):
                # Determine what text to send. The internal marker is popped
                # here so that no write path (including the fatal-error one
                # below) can leak it into the output file.
                if entry.pop("_from_discard", None):
                    input_text = entry.get("raw_text", "")
                else:
                    input_text = entry.get("polished_text", entry.get("raw_text", ""))

                response, error_type = llm_chat_completion(input_text)

                if response is None:
                    entry["naturalize_status"] = "error"
                    entry["naturalize_error"] = error_type
                    tally["error"] += 1
                    consecutive_errors += 1
                    if consecutive_errors >= 20:
                        print(f"\nFATAL: {consecutive_errors} consecutive errors. Stopping.",
                              file=sys.stderr)
                        out.write(json.dumps(entry, ensure_ascii=False) + "\n")
                        out.flush()
                        sys.exit(1)
                else:
                    tally[_apply_response(entry, input_text, response)] += 1
                    consecutive_errors = 0

                out.write(json.dumps(entry, ensure_ascii=False) + "\n")

                if (i + 1) % 10 == 0:
                    out.flush()

                if (i + 1) % 100 == 0:
                    total_done = len(already_processed) + i + 1
                    elapsed = time.time() - start_time
                    rate = (i + 1) / elapsed if elapsed > 0 else 0
                    eta_min = (len(remaining) - (i + 1)) / rate / 60 if rate > 0 else 0
                    print(f" [{total_done}/{len(candidates)}] "
                          f"naturalized={tally['naturalized']}, unchanged={tally['unchanged']}, "
                          f"skipped={tally['skipped']}, filtered={tally['filtered']}, "
                          f"errors={tally['error']} "
                          f"({rate:.1f}/s, ETA {eta_min:.0f}m)")

                # Small pause between requests to go easy on the endpoint.
                time.sleep(0.1)

    except KeyboardInterrupt:
        print(f"\nInterrupted at {i+1}/{len(remaining)}. Re-run to resume.", file=sys.stderr)

    elapsed = time.time() - start_time
    total = sum(tally.values())
    print(f"\nSession complete: {total} entries in {elapsed/60:.1f} minutes.")
    print(f" Naturalized: {tally['naturalized']}")
    print(f" Unchanged: {tally['unchanged']}")
    print(f" Skipped: {tally['skipped']}")
    print(f" Filtered: {tally['filtered']}")
    print(f" Errors: {tally['error']}")
    usable = tally["naturalized"] + tally["unchanged"]
    print(f" Usable: {usable}")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()