corpus generation (work from mid-February)
This commit is contained in:
parent
8c8a058301
commit
356b62c6ea
16 changed files with 25872 additions and 38 deletions
215
scripts/polish_corpus.py
Normal file
215
scripts/polish_corpus.py
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
#!/usr/bin/env python3
|
||||
"""LLM polish pipeline for raw folksy sayings.
|
||||
|
||||
Reads corpus_raw.jsonl, sends each to GLM4-32B for polish.
|
||||
Output file is the checkpoint — append mode with resume detection.
|
||||
|
||||
Usage:
|
||||
python scripts/polish_corpus.py
|
||||
python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Filesystem layout, anchored on this script's own location so the default
# input/output paths do not depend on the caller's working directory.
# resolve() makes the path absolute first: on Python < 3.9 a script's __file__
# may be relative (e.g. "polish_corpus.py"), in which case .parent.parent
# collapses to "." instead of the real project root.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

# Chat-completions endpoint and model name sent by llm_chat_completion().
# NOTE(review): the "/v1d/" path segment is unusual (OpenAI-compatible servers
# normally expose "/v1/") — confirm it is intentional for this deployment.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"
|
||||
|
||||
|
||||
# System message sent with every request: editing rules, the DISCARD escape
# hatch, and few-shot examples. main() compares the model's reply against the
# literal "DISCARD" (case-insensitively), so keep that token in sync.
SYSTEM_PROMPT = """You are an editor specializing in folk sayings and rural proverbs. You will receive a rough draft of a fake folksy saying along with the relationship chain it encodes.

Your job:
1. Fix grammar, articles, and pluralization
2. Make it sound natural — like something a weathered farmer would say while leaning on a fence post
3. Preserve the core nouns and the relationship between them — do not swap out the key words
4. You MAY add small colorful details (adjectives, folksy verb choices, regional flavor) but keep it concise — real proverbs are short
5. You MAY lightly restructure the sentence for better rhythm, but keep the same meaning pattern
6. If the saying is unsalvageable nonsense (the nouns don't relate in any meaningful way, or the combination is unintentionally offensive), respond with exactly: DISCARD

Output ONLY the polished saying on a single line. No quotes, no explanation, no preamble.

Examples of good polish:

Raw: "Don't build the coffee and act surprised when the water show up."
Chain: coffee MadeOf water
Polished: Don't brew the coffee and act surprised when the water's all gone.

Raw: "The chest's children always goes without hold books."
Chain: chest UsedFor hold_books
Polished: The bookshelf-maker's kids always end up reading off the floor.

Raw: "A pineapple is just a nectarine that's got an attitude."
Chain: pineapple IsA fruit, nectarine IsA fruit, pineapple HasProperty prickly
Polished: A pineapple is just a peach that grew itself some armor.

Raw: "You know what they say, a steel with no iron is just a harder than gold iron."
Chain: steel MadeOf iron, steel HasProperty hard
Polished: You know what they say — steel without the iron is just a dream of being hard.

Raw: "Funny how the bamboo never has enough grow very quickly for itself."
Chain: bamboo CapableOf grow_quickly
Polished: DISCARD

Raw: "That's just funning the canoe and praying for boiling food."
Chain: canoe UsedFor transport, fire UsedFor boiling_food
Polished: DISCARD"""
|
||||
|
||||
|
||||
def llm_chat_completion(messages, max_retries=3):
    """POST a chat-completion request and return the reply text.

    Args:
        messages: Chat messages forwarded as the request's "messages" field.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The stripped content of the first choice's message, or None once
        every attempt has failed (also when max_retries is not positive,
        since no attempt is made).
    """
    import requests

    payload = {
        "model": LLM_MODEL,
        "messages": messages,
    }
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            reply = requests.post(LLM_ENDPOINT, json=payload, timeout=120)
            reply.raise_for_status()
            body = reply.json()
            return body["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Any failure (transport, HTTP status, unexpected response shape)
            # counts as one failed attempt and is reported on stderr.
            print(f" LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
            if attempt >= final_attempt:
                return None
            # Exponential backoff between attempts: 1s, 2s, 4s, ...
            time.sleep(2 ** attempt)
|
||||
|
||||
|
||||
def format_chain(chain_edges):
    """Format chain_edges list into readable string for LLM context.

    Args:
        chain_edges: Sequence of edge dicts with optional "start",
            "relation", "end" and "weight" keys. May be empty or None.

    Returns:
        Comma-separated "start --relation--> end (w:W)" descriptions with
        the weight shown to one decimal place, or "(no chain data)" when
        there are no edges. Missing start/relation/end render as "?".
    """
    if not chain_edges:
        return "(no chain data)"

    parts = []
    for edge in chain_edges:
        start = edge.get("start", "?")
        rel = edge.get("relation", "?")
        end = edge.get("end", "?")
        # A JSON null or non-numeric weight would make the ".1f" format raise;
        # fall back to 0, the same value already used when the key is absent.
        try:
            weight = float(edge.get("weight", 0))
        except (TypeError, ValueError):
            weight = 0.0
        parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
    return ", ".join(parts)
|
||||
|
||||
|
||||
def format_slots(slots):
    """Format slots dict for LLM context.

    Args:
        slots: Mapping of slot name to fill value. May be empty or None.

    Returns:
        Comma-separated "name=value" pairs in the mapping's iteration order,
        or an empty string when there are no slots.
    """
    # Tolerate a missing/null slots field the same way format_chain tolerates
    # a missing chain, instead of failing on None.items().
    if not slots:
        return ""
    return ", ".join(f"{k}={v}" for k, v in slots.items())
|
||||
|
||||
|
||||
def load_already_processed(output_path):
    """Collect the raw_text of every entry already present in the output file.

    Used for resume detection. Blank lines and lines that are not valid JSON
    are ignored. Entries count as processed whatever their "status" is, and
    an entry without a "raw_text" key contributes the empty string.

    Args:
        output_path: Path of the JSONL output/checkpoint file; it may not
            exist yet.

    Returns:
        Set of raw_text strings found in the file (empty if the file is
        missing).
    """
    seen = set()
    if not output_path.exists():
        return seen

    with open(output_path, encoding="utf-8") as checkpoint:
        for raw_line in checkpoint:
            text = raw_line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                # Unparseable line (e.g. a truncated write): skip it.
                continue
            seen.add(record.get("raw_text", ""))
    return seen
|
||||
|
||||
|
||||
def _checkpoint_ends_mid_line(path):
    """Return True if *path* is a non-empty file whose last byte is not a newline."""
    if not path.exists() or path.stat().st_size == 0:
        return False
    with open(path, "rb") as f:
        f.seek(-1, 2)  # whence=2: offset is relative to the end of the file
        return f.read(1) != b"\n"


def _build_messages(entry):
    """Build the system + user chat messages for one raw corpus entry."""
    raw_text = entry.get("raw_text", "")
    meta_template = entry.get("meta_template", "")
    chain = format_chain(entry.get("chain", []))
    slots = format_slots(entry.get("slots", {}))

    user_prompt = (
        f"Meta-template: {meta_template}\n"
        f"Relationship chain: {chain}\n"
        f"Slot fills: {slots}\n"
        f"Raw saying: {raw_text}"
    )
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]


def main():
    """Polish every not-yet-processed raw saying and append the results.

    Reads the --input JSONL file, skips entries whose raw_text already
    appears in the --output file, sends the rest to the LLM one at a time,
    and appends each entry to --output with a "status" of "polished"
    (plus "polished_text"), "discarded", or "error". Prints progress every
    100 entries and a summary at the end.

    Entries whose request failed are still written (status "error"); since
    resume detection keys only on raw_text, they are skipped on later runs.

    Exits with status 1 if the input file does not exist.
    """
    parser = argparse.ArgumentParser(description="LLM polish pipeline for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_raw.jsonl"),
                        help="Input JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Output JSONL file (also serves as checkpoint)")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load raw entries, ignoring blank lines.
    with open(input_path, encoding="utf-8") as f:
        raw_entries = [json.loads(text) for text in map(str.strip, f) if text]

    print(f"Loaded {len(raw_entries)} raw entries from {input_path}")

    # Check what's already been processed
    already_processed = load_already_processed(output_path)
    remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)}")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    discards = 0
    polished = 0
    errors = 0

    # A custom --output may point into a directory that does not exist yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # If an earlier run died mid-write, the file ends in a partial line; start
    # on a fresh line so the first new record is not glued onto it.
    repair_newline = _checkpoint_ends_mid_line(output_path)

    with open(output_path, "a", encoding="utf-8") as out:
        if repair_newline:
            out.write("\n")

        for i, entry in enumerate(remaining):
            response = llm_chat_completion(_build_messages(entry))

            if response is None:
                entry["status"] = "error"
                errors += 1
            elif response.strip().upper() == "DISCARD":
                entry["status"] = "discarded"
                discards += 1
            else:
                entry["polished_text"] = response.strip()
                entry["status"] = "polished"
                polished += 1

            out.write(json.dumps(entry, ensure_ascii=False) + "\n")
            # The output file is the checkpoint: flush each record so an
            # interrupted run loses at most the in-flight entry. The cost is
            # negligible next to the LLM round-trip.
            out.flush()

            if (i + 1) % 100 == 0:
                total_done = len(already_processed) + i + 1
                print(f" [{total_done}/{len(raw_entries)}] "
                      f"polished={polished}, discarded={discards}, errors={errors}")

            # Brief pause between requests.
            time.sleep(0.1)

    total_done = len(already_processed) + len(remaining)
    print(f"\nDone: {total_done} total entries processed.")
    print(f" Polished: {polished}")
    print(f" Discarded: {discards}")
    print(f" Errors: {errors}")
    # The rate is computed over entries the model actually judged; errors are excluded.
    judged = polished + discards
    if judged:
        print(f" Discard rate: {discards/judged*100:.1f}%")
    else:
        print(" Discard rate: N/A")
    print(f"Output: {output_path}")
|
||||
|
||||
|
||||
# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue