Add naturalization pass — 9,025 sayings, 36K training pairs
New pipeline step: naturalize_corpus.py runs Prompt A ("dialect coach")
over both polished and previously-discarded sayings, recovering material
the first polish pass was too aggressive with.
Results:
- 9,468 usable from naturalization (vs 5,499 from initial polish)
- After dedup: 9,025 unique sayings (was 2,312)
- 36,079 training pairs (was 9,257)
- 100% vocab coverage, avg 10.1 words (punchier than 13.1)
- Relaxed quality filter: drops artifacts/nonsense, not noun presence
New scripts:
- naturalize_corpus.py: gentle LLM naturalization pass, resume-safe
- rebuild_training_pairs.py: combined filter + dedup + training pair
generation from naturalized corpus, replaces separate steps
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
651ec3ffc6
commit
9298c425bc
6 changed files with 65131 additions and 11532 deletions
File diff suppressed because it is too large
Load diff
19540
corpus/corpus_naturalized.jsonl
Normal file
19540
corpus/corpus_naturalized.jsonl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,91 +1,31 @@
|
||||||
{
|
{
|
||||||
"raw_count": 9835,
|
"naturalization_input": 19540,
|
||||||
"raw_by_template": {
|
"naturalization_status": {
|
||||||
"deconstruction": 1500,
|
"skipped": 436,
|
||||||
"denial_of_consequences": 1500,
|
"naturalized": 18578,
|
||||||
"false_equivalence": 1500,
|
"unchanged": 453,
|
||||||
"futile_preparation": 1500,
|
"filtered": 73
|
||||||
"hypocritical_complaint": 1500,
|
|
||||||
"ironic_deficiency": 1500,
|
|
||||||
"tautological_wisdom": 835
|
|
||||||
},
|
},
|
||||||
"polished_count": 5499,
|
"usable_before_dedup": 19031,
|
||||||
"discarded_during_polish": 4336,
|
"duplicates_removed": 10006,
|
||||||
"errors_during_polish": 0,
|
"final_filtered": 9025,
|
||||||
"polish_discard_rate": "44.1%",
|
"training_pairs": 36079,
|
||||||
"polished_by_template": {
|
"by_template": {
|
||||||
"deconstruction": 1105,
|
"deconstruction": 1544,
|
||||||
"denial_of_consequences": 733,
|
"denial_of_consequences": 750,
|
||||||
"false_equivalence": 590,
|
"false_equivalence": 1897,
|
||||||
"futile_preparation": 882,
|
"futile_preparation": 1735,
|
||||||
"hypocritical_complaint": 573,
|
"hypocritical_complaint": 811,
|
||||||
"ironic_deficiency": 831,
|
"ironic_deficiency": 1563,
|
||||||
"tautological_wisdom": 785
|
"tautological_wisdom": 725
|
||||||
},
|
},
|
||||||
"discarded_by_template": {
|
"by_input_type": {
|
||||||
"deconstruction": 395,
|
"category_seeded": 9025,
|
||||||
"denial_of_consequences": 767,
|
"open_ended": 2146,
|
||||||
"false_equivalence": 910,
|
"persona_seeded": 9025,
|
||||||
"futile_preparation": 618,
|
"template_seeded": 6858,
|
||||||
"hypocritical_complaint": 927,
|
"word_seeded": 9025
|
||||||
"ironic_deficiency": 669,
|
|
||||||
"tautological_wisdom": 50
|
|
||||||
},
|
},
|
||||||
"filtered_count": 2312,
|
"vocab_coverage": "624/624 (100.0%)",
|
||||||
"filtered_by_template": {
|
"avg_length_words": 10.1
|
||||||
"deconstruction": 619,
|
|
||||||
"denial_of_consequences": 159,
|
|
||||||
"false_equivalence": 517,
|
|
||||||
"futile_preparation": 284,
|
|
||||||
"hypocritical_complaint": 168,
|
|
||||||
"ironic_deficiency": 358,
|
|
||||||
"tautological_wisdom": 207
|
|
||||||
},
|
|
||||||
"discarded_during_filter": 3187,
|
|
||||||
"training_pair_count": 9257,
|
|
||||||
"training_by_template": {
|
|
||||||
"deconstruction": 2488,
|
|
||||||
"denial_of_consequences": 630,
|
|
||||||
"false_equivalence": 2059,
|
|
||||||
"futile_preparation": 1146,
|
|
||||||
"hypocritical_complaint": 681,
|
|
||||||
"ironic_deficiency": 1429,
|
|
||||||
"tautological_wisdom": 824
|
|
||||||
},
|
|
||||||
"training_by_input_type": {
|
|
||||||
"category_seeded": 2312,
|
|
||||||
"open_ended": 562,
|
|
||||||
"persona_seeded": 2312,
|
|
||||||
"template_seeded": 1759,
|
|
||||||
"word_seeded": 2312
|
|
||||||
},
|
|
||||||
"unique_slot_words_used": 609,
|
|
||||||
"total_vocab_words": 624,
|
|
||||||
"vocab_coverage": "97.6%",
|
|
||||||
"words_never_used": [
|
|
||||||
"agate",
|
|
||||||
"alabaster",
|
|
||||||
"anise",
|
|
||||||
"azalea",
|
|
||||||
"bee",
|
|
||||||
"blowfish",
|
|
||||||
"cattail",
|
|
||||||
"cypress",
|
|
||||||
"emerald",
|
|
||||||
"gem",
|
|
||||||
"grebe",
|
|
||||||
"juniper",
|
|
||||||
"lyre",
|
|
||||||
"spear",
|
|
||||||
"theater"
|
|
||||||
],
|
|
||||||
"words_never_used_count": 15,
|
|
||||||
"avg_saying_length_words": 13.1,
|
|
||||||
"min_saying_length_words": 6,
|
|
||||||
"max_saying_length_words": 23,
|
|
||||||
"balance_warnings": [
|
|
||||||
"WARNING: denial_of_consequences has only 159 entries (6.9%) — below 10% threshold. Generate more raw sayings for this family.",
|
|
||||||
"WARNING: hypocritical_complaint has only 168 entries (7.3%) — below 10% threshold. Generate more raw sayings for this family.",
|
|
||||||
"WARNING: tautological_wisdom has only 207 entries (9.0%) — below 10% threshold. Generate more raw sayings for this family."
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
File diff suppressed because it is too large
Load diff
306
scripts/naturalize_corpus.py
Normal file
306
scripts/naturalize_corpus.py
Normal file
|
|
@ -0,0 +1,306 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Naturalization pass for polished folksy sayings.
|
||||||
|
|
||||||
|
Takes the polished corpus (both filtered and fixable discards) and runs a
|
||||||
|
second LLM pass focused on making them sound like real folk sayings rather
|
||||||
|
than template output. Uses Prompt A (gentle naturalization).
|
||||||
|
|
||||||
|
Resume-safe: tracks already-processed entries by raw_text.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/naturalize_corpus.py
|
||||||
|
python3 scripts/naturalize_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_naturalized.jsonl
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Project layout: scripts/ lives beside corpus/.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

# Local OpenAI-compatible chat-completions server.
# NOTE(review): the "v1d" path segment is unusual — the OpenAI-compatible
# convention is "/v1/chat/completions"; confirm the endpoint is intentional.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

# Prompt A ("dialect coach"): gentle naturalization. The model must reply
# with ONLY the rewritten saying, or the sentinel "SKIP" for unsalvageable
# input — both forms are parsed by main().
SYSTEM_PROMPT = """You are a dialect coach for folk sayings. You'll receive a fake proverb that sounds slightly mechanical or template-generated. Your job is to make it sound like something a real person would actually say — natural rhythm, casual grammar, the kind of thing you'd overhear at a general store.

Rules:
- Keep the same meaning and core nouns
- Fix awkward phrasing, robotic word order, or template artifacts
- Make it conversational — contractions, folksy grammar, natural cadence
- Keep it SHORT (under 20 words preferred)
- If it already sounds natural, return it unchanged
- If it's unsalvageable nonsense, respond with: SKIP

Output ONLY the naturalized saying. No quotes, no explanation."""
|
||||||
|
|
||||||
|
|
||||||
|
def llm_chat_completion(text, max_retries=3):
    """Send text for naturalization. Returns (result, error_type).

    On success the second element is None; on failure the first element is
    None and the second is a short tag naming the failure mode
    ("context_too_long", "http_400", "server_overload", "null_content",
    "json_error", "unexpected", "exhausted_retries").
    Transient failures are retried with exponential backoff.
    """
    # Imported lazily so the module can be imported without requests installed.
    import requests

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
                "temperature": 0.7,
            }, timeout=120)

            # A 400 whose body mentions context/token/length means the prompt
            # itself is too big — retrying cannot help, so bail immediately.
            if resp.status_code == 400:
                body = resp.text.lower()
                if any(kw in body for kw in ["context", "token", "length"]):
                    return None, "context_too_long"
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return None, "http_400"

            # Rate limiting / overload: back off one step longer than usual.
            if resp.status_code in (429, 503):
                if attempt < max_retries - 1:
                    time.sleep(2 ** (attempt + 1))
                    continue
                return None, "server_overload"

            resp.raise_for_status()

            try:
                data = resp.json()
                content = data["choices"][0]["message"]["content"]
                # Some backends return a null content field; treat as retryable.
                if content is None:
                    if attempt < max_retries - 1:
                        time.sleep(1)
                        continue
                    return None, "null_content"
                return content.strip(), None
            except (json.JSONDecodeError, KeyError, IndexError) as e:
                # Malformed or unexpectedly-shaped response body.
                print(f" Parse error (attempt {attempt+1}): {e}", file=sys.stderr)
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return None, "json_error"

        except Exception as e:
            # Network errors, timeouts, and raise_for_status() HTTP errors all
            # land here and are retried with exponential backoff.
            print(f" Error (attempt {attempt+1}): {type(e).__name__}: {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            return None, "unexpected"

    return None, "exhausted_retries"
|
||||||
|
|
||||||
|
|
||||||
|
def relaxed_quality_filter(text):
    """Lenient quality gate: reject only artifacts and obvious nonsense.

    Unlike the strict polish-stage filter, this does not require any
    particular noun to be present in the saying.

    Returns:
        (passed, reason) — reason is "pass" on success, otherwise a short
        tag naming the first rule that rejected the text.
    """
    if not text:
        return False, "empty"

    word_count = len(text.split())
    if word_count > 25:
        return False, "too_long"
    if word_count < 4:
        return False, "too_short"

    # Leftover template / ConceptNet machinery.
    if "_" in text:
        return False, "conceptnet_artifact"
    if "{" in text or "}" in text:
        return False, "unfilled_slot"

    # Phrases that indicate the LLM replied *about* the saying rather than
    # *with* the saying.
    meta_markers = ("here's", "here is", "this saying", "i've", "i have",
                    "note:", "explanation:", "bridge word")
    lowered = text.lower()
    for marker in meta_markers:
        if marker in lowered:
            return False, "meta_commentary"

    return True, "pass"
|
||||||
|
|
||||||
|
|
||||||
|
def load_already_processed(output_path):
    """Scan an existing output file so a rerun can resume where it stopped.

    Returns:
        (processed, counts) — the set of raw_text values already written,
        and a tally of entries per naturalize_status.
    """
    seen = set()
    tally = dict.fromkeys(
        ("naturalized", "skipped", "unchanged", "error", "filtered"), 0)

    if not output_path.exists():
        return seen, tally

    with open(output_path, encoding="utf-8") as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError:
                # Tolerate a partially-written trailing line from a crash.
                continue
            seen.add(record.get("raw_text", ""))
            status = record.get("naturalize_status", "")
            if status in tally:
                tally[status] += 1

    return seen, tally
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the naturalization pass over the polished corpus.

    Loads polished entries plus salvageable discards, skips anything already
    present in the output file (resume), and appends one JSON line per
    processed entry with a ``naturalize_status`` field.
    """
    parser = argparse.ArgumentParser(description="Naturalization pass for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Input polished JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"),
                        help="Output naturalized JSONL file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load all polished entries (both status=polished and status=discarded with raw_text)
    candidates = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue

            status = entry.get("status", "")
            if status == "polished":
                # Already polished — naturalize the polished text
                candidates.append(entry)
            elif status == "discarded":
                # Was discarded by first polish — try naturalizing the raw
                # text, but only if plausibly salvageable (reasonable length,
                # no leftover template underscores).
                raw = entry.get("raw_text", "")
                if raw and 4 <= len(raw.split()) <= 25 and "_" not in raw:
                    entry["_from_discard"] = True
                    candidates.append(entry)

    print(f"Loaded {len(candidates)} candidates "
          f"({sum(1 for c in candidates if not c.get('_from_discard'))} polished, "
          f"{sum(1 for c in candidates if c.get('_from_discard'))} from discards)")

    # Resume check: anything already in the output file is skipped.
    already_processed, prev_counts = load_already_processed(output_path)
    remaining = [e for e in candidates if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)} "
          f"(naturalized={prev_counts['naturalized']}, "
          f"unchanged={prev_counts['unchanged']}, "
          f"skipped={prev_counts['skipped']}, "
          f"filtered={prev_counts['filtered']}, "
          f"errors={prev_counts['error']})")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    naturalized = 0
    unchanged = 0
    skipped = 0
    filtered_out = 0
    errors = 0
    consecutive_errors = 0
    start_time = time.time()
    # Fix: pre-bind the loop index so the KeyboardInterrupt handler below
    # cannot hit a NameError when the interrupt lands before the first
    # iteration (e.g. while opening the output file).
    i = -1

    try:
        with open(output_path, "a", encoding="utf-8") as out:
            for i, entry in enumerate(remaining):
                # Determine what text to send: discards never got a polished
                # text, so they go through on their raw form.
                if entry.get("_from_discard"):
                    input_text = entry.get("raw_text", "")
                else:
                    input_text = entry.get("polished_text", entry.get("raw_text", ""))

                response, error_type = llm_chat_completion(input_text)

                if response is None:
                    entry["naturalize_status"] = "error"
                    entry["naturalize_error"] = error_type
                    errors += 1
                    consecutive_errors += 1

                    if consecutive_errors >= 20:
                        # Sustained failure — likely the server is down.
                        print(f"\nFATAL: {consecutive_errors} consecutive errors. Stopping.",
                              file=sys.stderr)
                        # Fix: strip the internal marker before writing so the
                        # aborting write matches the normal-path output schema.
                        entry.pop("_from_discard", None)
                        out.write(json.dumps(entry, ensure_ascii=False) + "\n")
                        out.flush()
                        sys.exit(1)

                elif response.strip().upper() == "SKIP":
                    # Model judged the saying unsalvageable.
                    entry["naturalize_status"] = "skipped"
                    skipped += 1
                    consecutive_errors = 0

                else:
                    cleaned = response.strip()
                    # Strip quotes if the model wrapped its answer.
                    if cleaned.startswith('"') and cleaned.endswith('"'):
                        cleaned = cleaned[1:-1]

                    # Apply relaxed quality filter (artifacts/nonsense only).
                    passed, reason = relaxed_quality_filter(cleaned)
                    if not passed:
                        entry["naturalize_status"] = "filtered"
                        entry["naturalize_filter_reason"] = reason
                        filtered_out += 1
                    elif cleaned == input_text:
                        entry["naturalized_text"] = cleaned
                        entry["naturalize_status"] = "unchanged"
                        unchanged += 1
                    else:
                        entry["naturalized_text"] = cleaned
                        entry["naturalize_status"] = "naturalized"
                        naturalized += 1

                    consecutive_errors = 0

                # Clean up internal field before persisting.
                entry.pop("_from_discard", None)

                out.write(json.dumps(entry, ensure_ascii=False) + "\n")

                # Flush periodically so a crash loses at most 10 entries.
                if (i + 1) % 10 == 0:
                    out.flush()

                if (i + 1) % 100 == 0:
                    total_done = len(already_processed) + i + 1
                    elapsed = time.time() - start_time
                    rate = (i + 1) / elapsed
                    eta_min = (len(remaining) - (i + 1)) / rate / 60 if rate > 0 else 0
                    print(f" [{total_done}/{len(candidates)}] "
                          f"naturalized={naturalized}, unchanged={unchanged}, "
                          f"skipped={skipped}, filtered={filtered_out}, errors={errors} "
                          f"({rate:.1f}/s, ETA {eta_min:.0f}m)")

                # Small courtesy delay between requests.
                time.sleep(0.1)

    except KeyboardInterrupt:
        print(f"\nInterrupted at {i+1}/{len(remaining)}. Re-run to resume.", file=sys.stderr)

    elapsed = time.time() - start_time
    total = naturalized + unchanged + skipped + filtered_out + errors
    print(f"\nSession complete: {total} entries in {elapsed/60:.1f} minutes.")
    print(f" Naturalized: {naturalized}")
    print(f" Unchanged: {unchanged}")
    print(f" Skipped: {skipped}")
    print(f" Filtered: {filtered_out}")
    print(f" Errors: {errors}")
    usable = naturalized + unchanged
    print(f" Usable: {usable}")
    print(f"Output: {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
278
scripts/rebuild_training_pairs.py
Normal file
278
scripts/rebuild_training_pairs.py
Normal file
|
|
@ -0,0 +1,278 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Rebuild training pairs from naturalized corpus.
|
||||||
|
|
||||||
|
Reads corpus_naturalized.jsonl, applies relaxed quality filter,
|
||||||
|
deduplicates, and formats training pairs. Replaces the separate
|
||||||
|
filter_corpus.py + format_training_pairs.py steps.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/rebuild_training_pairs.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Project layout: scripts/ lives beside corpus/, data/ and examples/.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
DATA_DIR = PROJECT_DIR / "data"
EXAMPLES_DIR = PROJECT_DIR / "examples"

# Speaker personas for "What would a <persona> say about ..." prompts.
PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]

# Seed-free prompts, sampled for a fraction of entries in generate_training_pairs.
OPEN_ENDED_PROMPTS = [
    "Tell me some folk wisdom.",
    "What do they say?",
    "Give me a proverb.",
    "Share some old-time wisdom.",
    "What's a good saying?",
]

# Human-readable names for the template families, used in template-seeded prompts.
TEMPLATE_NAMES = {
    "deconstruction": "deconstruction",
    "denial_of_consequences": "denial of consequences",
    "ironic_deficiency": "ironic deficiency",
    "futile_preparation": "futile preparation",
    "hypocritical_complaint": "hypocritical complaint",
    "tautological_wisdom": "tautological wisdom",
    "false_equivalence": "false equivalence",
}
|
||||||
|
|
||||||
|
|
||||||
|
def is_near_duplicate(text_a, text_b, threshold=0.75):
    """Return True when the two strings are near-identical, ignoring case."""
    similarity = SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio()
    return similarity > threshold


def deduplicate_within_family(entries):
    """Remove near-duplicate sayings, comparing only within a template family.

    The first occurrence of each near-duplicate cluster wins; later lookalikes
    are dropped.

    Returns:
        (kept, removed) — the surviving entries and the number dropped.
    """
    families = {}
    for item in entries:
        families.setdefault(item.get("meta_template", "unknown"), []).append(item)

    survivors = []
    dropped = 0

    for members in families.values():
        unique = []
        for candidate in members:
            candidate_text = candidate.get("final_text", "")
            if any(is_near_duplicate(candidate_text, keeper.get("final_text", ""))
                   for keeper in unique):
                dropped += 1
            else:
                unique.append(candidate)
        survivors.extend(unique)

    return survivors, dropped
|
||||||
|
|
||||||
|
|
||||||
|
def load_vocab_categories():
    """Map each vocab word to its list of categories from folksy_vocab.csv.

    Returns an empty dict when the vocab file is absent.
    """
    categories_by_word = {}
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as fh:
            for row in csv.DictReader(fh):
                tags = row["categories"].split(",")
                categories_by_word[row["word"]] = [t.strip() for t in tags if t.strip()]
    return categories_by_word
|
||||||
|
|
||||||
|
|
||||||
|
def generate_training_pairs(entry, word_cats):
    """Build instruction/response training pairs for one corpus entry.

    Emits up to five pairs per saying: word-seeded and persona-seeded (when
    slot words exist), category-seeded (when any slot word has vocab
    categories), template-seeded (~70% of entries) and open-ended (~30%).
    """
    text = entry.get("final_text", "")
    slots = entry.get("slots", {})
    meta_template = entry.get("meta_template", "")

    # Slot fillers usable as prompt seeds; skip empties, single characters,
    # and article-prefixed phrases.
    source_words = [
        filler for filler in slots.values()
        if filler and len(filler) > 1 and not filler.startswith(("a ", "an "))
    ]

    slot_categories = set()
    for filler in source_words:
        key = filler.lower().replace(" ", "_")
        slot_categories.update(word_cats.get(key, ()))

    base = {
        "output": text,
        "meta_template": meta_template,
        "source_words": source_words,
    }
    pairs = []

    if source_words:
        seed = random.choice(source_words)
        pairs.append({**base, "input": f"Tell me something about {seed}."})

    if slot_categories:
        category = random.choice(list(slot_categories))
        pairs.append({**base, "input": f"Tell me a saying about {category}."})

    # NB: the persona is drawn unconditionally (even when source_words is
    # empty) so the RNG call sequence matches the original behavior.
    persona = random.choice(PERSONAS)
    if source_words:
        seed = random.choice(source_words)
        pairs.append({**base, "input": f"What would a {persona} say about {seed}?"})

    if random.random() < 0.7:
        template_name = TEMPLATE_NAMES.get(meta_template, meta_template)
        pairs.append({**base, "input": f"Give me a {template_name} proverb."})

    if random.random() < 0.3:
        pairs.append({**base, "input": random.choice(OPEN_ENDED_PROMPTS)})

    return pairs
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Rebuild the filtered corpus, training pairs, and stats file.

    Pipeline: load usable naturalized entries -> near-duplicate removal
    within template families -> write filtered corpus -> generate training
    pairs -> write pairs plus a JSON stats summary.
    """
    parser = argparse.ArgumentParser(description="Rebuild training pairs from naturalized corpus.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_naturalized.jsonl"))
    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"))
    parser.add_argument("--filtered-output", default=str(CORPUS_DIR / "corpus_filtered.jsonl"))
    parser.add_argument("--stats-output", default=str(CORPUS_DIR / "corpus_stats.json"))
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    filtered_path = Path(args.filtered_output)
    stats_path = Path(args.stats_output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load naturalized entries — use naturalized_text if available, else polished_text
    usable = []
    total_loaded = 0
    status_counts = Counter()

    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            total_loaded += 1
            nat_status = entry.get("naturalize_status", "")
            status_counts[nat_status] += 1

            if nat_status in ("naturalized", "unchanged"):
                final = entry.get("naturalized_text", entry.get("polished_text", ""))
                if final:
                    entry["final_text"] = final
                    usable.append(entry)

    print(f"Loaded {total_loaded} entries from {input_path}")
    print(f"Status breakdown: {dict(status_counts)}")
    print(f"Usable (naturalized + unchanged): {len(usable)}")

    # Deduplicate within each template family.
    kept, dup_count = deduplicate_within_family(usable)
    print(f"Near-duplicate removal: {dup_count} removed, {len(kept)} remaining")

    # Write filtered corpus.
    filtered_path.parent.mkdir(parents=True, exist_ok=True)
    with open(filtered_path, "w", encoding="utf-8") as f:
        for entry in kept:
            # Write with final_text as polished_text for compatibility
            out_entry = {k: v for k, v in entry.items() if k != "final_text"}
            out_entry["polished_text"] = entry["final_text"]
            f.write(json.dumps(out_entry, ensure_ascii=False) + "\n")

    print(f"Filtered corpus: {len(kept)} entries -> {filtered_path}")

    # Generate training pairs.
    word_cats = load_vocab_categories()
    all_pairs = []
    for entry in kept:
        all_pairs.extend(generate_training_pairs(entry, word_cats))

    # Fix: ensure the output directory exists — filtered_path got this
    # treatment above, but output_path did not.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    # Stats: classify each pair by which prompt pattern produced it.
    template_counts = Counter(e.get("meta_template", "unknown") for e in kept)
    input_type_counts = Counter()
    for pair in all_pairs:
        inp = pair["input"]
        if inp.startswith("Tell me something about"):
            input_type_counts["word_seeded"] += 1
        elif inp.startswith("Tell me a saying about"):
            input_type_counts["category_seeded"] += 1
        elif inp.startswith("What would a"):
            input_type_counts["persona_seeded"] += 1
        elif inp.startswith("Give me a") and "proverb" in inp:
            input_type_counts["template_seeded"] += 1
        else:
            input_type_counts["open_ended"] += 1

    # Vocab coverage: which vocab words appear as slot fillers.
    vocab_words = set()
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                vocab_words.add(row["word"])

    used_words = set()
    for entry in kept:
        for v in entry.get("slots", {}).values():
            # Fix: slot values can be empty/None (generate_training_pairs
            # already guards for this) — skip instead of crashing on .lower().
            if not v:
                continue
            word = v.lower().replace(" ", "_")
            if word in vocab_words:
                used_words.add(word)

    lengths = [len(e["final_text"].split()) for e in kept if e.get("final_text")]

    stats = {
        "naturalization_input": total_loaded,
        "naturalization_status": dict(status_counts),
        "usable_before_dedup": len(usable),
        "duplicates_removed": dup_count,
        "final_filtered": len(kept),
        "training_pairs": len(all_pairs),
        "by_template": dict(sorted(template_counts.items())),
        "by_input_type": dict(sorted(input_type_counts.items())),
        "vocab_coverage": f"{len(used_words)}/{len(vocab_words)} ({len(used_words)/len(vocab_words)*100:.1f}%)" if vocab_words else "N/A",
        "avg_length_words": round(sum(lengths) / len(lengths), 1) if lengths else 0,
    }

    # Fix: create the stats directory too, for symmetry with the other outputs.
    stats_path.parent.mkdir(parents=True, exist_ok=True)
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*50}")
    print("FINAL CORPUS STATS")
    print(f"{'='*50}")
    print(f"Unique sayings: {len(kept)}")
    print(f"Training pairs: {len(all_pairs)}")
    print(f"Avg length: {stats['avg_length_words']} words")
    print(f"Vocab coverage: {stats['vocab_coverage']}")
    print("\nBy template:")
    for t, c in sorted(template_counts.items()):
        pct = c / len(kept) * 100
        flag = " <-- below 10%" if pct < 10 else ""
        print(f" {t:30s} {c:5d} ({pct:5.1f}%){flag}")
    print("\nBy input type:")
    for t, c in sorted(input_type_counts.items()):
        print(f" {t:20s} {c:5d}")
    print("\nOutputs:")
    print(f" {filtered_path}")
    print(f" {output_path}")
    print(f" {stats_path}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue