folksy_idioms/scripts/polish_corpus.py

215 lines
7.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""LLM polish pipeline for raw folksy sayings.
Reads corpus_raw.jsonl, sends each to GLM4-32B for polish.
The output file doubles as the checkpoint: it is opened in append mode, and entries already present in it are skipped on resume.
Usage:
python scripts/polish_corpus.py
python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
"""
import argparse
import json
import sys
import time
from pathlib import Path
# Directory containing this script.
SCRIPT_DIR = Path(__file__).parent
# Parent of the script directory; the corpus directory is resolved against it.
PROJECT_DIR = SCRIPT_DIR.parent
# Directory holding the default input and output JSONL files (see main()'s argument defaults).
CORPUS_DIR = PROJECT_DIR / "corpus"
# URL that llm_chat_completion() POSTs chat requests to.
# NOTE(review): the path segment is "v1d" rather than the more common "v1" — confirm this is intentional.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
# Model identifier sent in the "model" field of every request.
LLM_MODEL = "THUDM-GLM4-32B"
# System message sent with every request. It tells the model to answer with the
# polished saying on a single line, or with the literal word DISCARD, which
# main() compares case-insensitively to mark an entry as discarded.
# This text is runtime data: any edit changes what the model is asked to do.
SYSTEM_PROMPT = """You are an editor specializing in folk sayings and rural proverbs. You will receive a rough draft of a fake folksy saying along with the relationship chain it encodes.
Your job:
1. Fix grammar, articles, and pluralization
2. Make it sound natural like something a weathered farmer would say while leaning on a fence post
3. Preserve the core nouns and the relationship between them do not swap out the key words
4. You MAY add small colorful details (adjectives, folksy verb choices, regional flavor) but keep it concise real proverbs are short
5. You MAY lightly restructure the sentence for better rhythm, but keep the same meaning pattern
6. If the saying is unsalvageable nonsense (the nouns don't relate in any meaningful way, or the combination is unintentionally offensive), respond with exactly: DISCARD
Output ONLY the polished saying on a single line. No quotes, no explanation, no preamble.
Examples of good polish:
Raw: "Don't build the coffee and act surprised when the water show up."
Chain: coffee MadeOf water
Polished: Don't brew the coffee and act surprised when the water's all gone.
Raw: "The chest's children always goes without hold books."
Chain: chest UsedFor hold_books
Polished: The bookshelf-maker's kids always end up reading off the floor.
Raw: "A pineapple is just a nectarine that's got an attitude."
Chain: pineapple IsA fruit, nectarine IsA fruit, pineapple HasProperty prickly
Polished: A pineapple is just a peach that grew itself some armor.
Raw: "You know what they say, a steel with no iron is just a harder than gold iron."
Chain: steel MadeOf iron, steel HasProperty hard
Polished: You know what they say steel without the iron is just a dream of being hard.
Raw: "Funny how the bamboo never has enough grow very quickly for itself."
Chain: bamboo CapableOf grow_quickly
Polished: DISCARD
Raw: "That's just funning the canoe and praying for boiling food."
Chain: canoe UsedFor transport, fire UsedFor boiling_food
Polished: DISCARD"""
def llm_chat_completion(messages, max_retries=3):
    """Request a chat completion from the LLM endpoint, retrying on failure.

    Args:
        messages: List of chat message dicts, sent as the "messages" field.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The stripped text of the first choice's message, or None when every
        attempt failed (or when max_retries is not positive).
    """
    # Imported lazily so the module can be imported without `requests` installed.
    import requests

    payload = {
        "model": LLM_MODEL,
        "messages": messages,
    }
    last_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            reply = requests.post(LLM_ENDPOINT, json=payload, timeout=120)
            reply.raise_for_status()
            body = reply.json()
            return body["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Any failure (transport, HTTP status, unexpected response shape)
            # counts as one failed attempt.
            print(f" LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
            if attempt >= last_attempt:
                break
            # Exponential backoff: 1s, 2s, 4s, ... between attempts.
            time.sleep(2 ** attempt)

    return None
def format_chain(chain_edges):
    """Format a list of chain edges into a readable string for LLM context.

    Args:
        chain_edges: Iterable of edge dicts. The keys "start", "relation",
            "end" and "weight" are read; missing text fields render as "?".

    Returns:
        Edges rendered as "start --relation--> end (w:W.W)" joined by ", ",
        or "(no chain data)" when chain_edges is empty or None.
    """
    if not chain_edges:
        return "(no chain data)"

    parts = []
    for edge in chain_edges:
        start = edge.get("start", "?")
        rel = edge.get("relation", "?")
        end = edge.get("end", "?")
        # A JSON null or non-numeric weight would make the ":.1f" format raise
        # and abort the whole run; fall back to the same 0 used for a missing key.
        try:
            weight = float(edge.get("weight", 0))
        except (TypeError, ValueError):
            weight = 0.0
        parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
    return ", ".join(parts)
def format_slots(slots):
    """Format a slots mapping as "key=value" pairs for LLM context.

    Args:
        slots: Mapping of slot name to fill value, or None.

    Returns:
        The pairs joined by ", " in mapping order; an empty string when
        slots is empty or None.
    """
    # An entry with '"slots": null' reaches here as None; treat it like an
    # empty mapping instead of failing on None.items().
    if not slots:
        return ""
    return ", ".join(f"{k}={v}" for k, v in slots.items())
def load_already_processed(output_path):
    """Collect the raw_text of every entry already present in the output file.

    Used for resume: callers skip any input entry whose raw_text is in the
    returned set. Entries are counted regardless of any "status" field they
    carry, and an entry without "raw_text" contributes the empty string.

    Args:
        output_path: Path to the output/checkpoint JSONL file.

    Returns:
        A set of raw_text strings; empty when the file does not exist.
    """
    processed = set()
    if not output_path.exists():
        return processed

    with open(output_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # Unparseable line (for example one cut short mid-write): skip it.
                continue
            # Valid JSON that is not an object (list, string, number) has no
            # .get(); skip it like any other malformed line instead of crashing.
            if not isinstance(entry, dict):
                continue
            processed.add(entry.get("raw_text", ""))
    return processed
def main():
    """Polish every not-yet-processed raw saying and append the results.

    Reads the input JSONL, skips entries whose raw_text already appears in
    the output file, sends each remaining entry to the LLM, and appends one
    JSON line per entry with a "status" of "polished", "discarded" or
    "error" ("polished" entries also get "polished_text").

    Exits with status 1 when the input file does not exist.

    NOTE(review): entries written with status "error" are present in the
    output file, so a later run treats them as already processed and does
    not retry them — confirm this is the intended resume behaviour.
    """
    parser = argparse.ArgumentParser(description="LLM polish pipeline for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_raw.jsonl"),
                        help="Input JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Output JSONL file (also serves as checkpoint)")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load raw entries, ignoring blank lines.
    with open(input_path, encoding="utf-8") as f:
        raw_entries = [json.loads(s) for s in (line.strip() for line in f) if s]
    print(f"Loaded {len(raw_entries)} raw entries from {input_path}")

    # Check what's already been processed
    already_processed = load_already_processed(output_path)
    remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]
    print(f"Already processed: {len(already_processed)}")
    print(f"Remaining: {len(remaining)}")
    if not remaining:
        print("Nothing to process.")
        return

    discards = 0
    polished = 0
    errors = 0
    with open(output_path, "a", encoding="utf-8") as out:
        for i, entry in enumerate(remaining):
            raw_text = entry.get("raw_text", "")
            meta_template = entry.get("meta_template", "")
            chain = format_chain(entry.get("chain", []))
            slots = format_slots(entry.get("slots", {}))
            user_prompt = (
                f"Meta-template: {meta_template}\n"
                f"Relationship chain: {chain}\n"
                f"Slot fills: {slots}\n"
                f"Raw saying: {raw_text}"
            )
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ]
            response = llm_chat_completion(messages)

            if response is None:
                entry["status"] = "error"
                errors += 1
            elif response.strip().upper() == "DISCARD":
                entry["status"] = "discarded"
                discards += 1
            else:
                entry["polished_text"] = response.strip()
                entry["status"] = "polished"
                polished += 1

            out.write(json.dumps(entry, ensure_ascii=False) + "\n")
            # The output file is the checkpoint, and each line costs an LLM
            # round-trip. Flush after every entry so a hard kill cannot lose
            # buffered results or leave a half-written line behind.
            out.flush()

            if (i + 1) % 100 == 0:
                total_done = len(already_processed) + i + 1
                print(f" [{total_done}/{len(raw_entries)}] "
                      f"polished={polished}, discarded={discards}, errors={errors}")
            # Short pause between requests.
            time.sleep(0.1)

    total_done = len(already_processed) + len(remaining)
    print(f"\nDone: {total_done} total entries processed.")
    print(f" Polished: {polished}")
    print(f" Discarded: {discards}")
    print(f" Errors: {errors}")
    # The rate is over entries the model actually judged; keep the label in
    # both cases so the summary line is identifiable when nothing was judged.
    judged = polished + discards
    if judged:
        print(f" Discard rate: {discards/judged*100:.1f}%")
    else:
        print(" Discard rate: N/A")
    print(f"Output: {output_path}")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()