corpus generation (work from mid-February)

John McCardle 2026-03-09 19:52:09 -04:00
commit 356b62c6ea
16 changed files with 25872 additions and 38 deletions

scripts/compute_corpus_stats.py Normal file

@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""Compute corpus statistics and validation metrics.
Reads corpus files and computes counts, distributions, coverage, and balance warnings.
Usage:
python scripts/compute_corpus_stats.py
python scripts/compute_corpus_stats.py --corpus-dir corpus/
"""
import argparse
import csv
import json
import sys
from collections import Counter
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"
def load_jsonl(path):
"""Load a JSONL file."""
entries = []
if not path.exists():
return entries
with open(path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
entries.append(json.loads(line))
return entries
def classify_input_type(inp):
    """Classify the input framing type of a training pair."""
    if inp.startswith("Tell me something about"):
        return "word_seeded"
    elif inp.startswith("Tell me a saying about"):
        return "category_seeded"
    elif inp.startswith("What would a"):
        return "persona_seeded"
    # Check the bare open-ended prompts before the template-seeded pattern so
    # that "Give me a proverb." is not miscounted as template_seeded.
    elif any(inp.startswith(p) for p in [
        "Tell me some folk", "What do they", "Give me a proverb",
        "Share some", "What's a good"
    ]):
        return "open_ended"
    elif inp.startswith("Give me a") and "proverb" in inp:
        return "template_seeded"
    else:
        return "fictional"
def main():
parser = argparse.ArgumentParser(description="Compute corpus statistics.")
parser.add_argument("--corpus-dir", default=str(PROJECT_DIR / "corpus"),
help="Corpus directory")
parser.add_argument("--output", default=None,
help="Output JSON file (default: corpus_dir/corpus_stats.json)")
args = parser.parse_args()
corpus_dir = Path(args.corpus_dir)
output_path = Path(args.output) if args.output else corpus_dir / "corpus_stats.json"
# Load all corpus files
raw = load_jsonl(corpus_dir / "corpus_raw.jsonl")
polished = load_jsonl(corpus_dir / "corpus_polished.jsonl")
filtered = load_jsonl(corpus_dir / "corpus_filtered.jsonl")
training = load_jsonl(corpus_dir / "training_pairs.jsonl")
# Load vocab for coverage analysis
vocab_words = set()
vocab_path = DATA_DIR / "folksy_vocab.csv"
if vocab_path.exists():
with open(vocab_path, newline="", encoding="utf-8") as f:
for row in csv.DictReader(f):
vocab_words.add(row["word"])
stats = {}
# --- Raw corpus stats ---
stats["raw_count"] = len(raw)
raw_by_template = Counter(e.get("meta_template", "unknown") for e in raw)
stats["raw_by_template"] = dict(sorted(raw_by_template.items()))
# --- Polish stats ---
polished_entries = [e for e in polished if e.get("status") == "polished"]
discarded_entries = [e for e in polished if e.get("status") == "discarded"]
error_entries = [e for e in polished if e.get("status") == "error"]
stats["polished_count"] = len(polished_entries)
stats["discarded_during_polish"] = len(discarded_entries)
stats["errors_during_polish"] = len(error_entries)
if polished_entries or discarded_entries:
total_processed = len(polished_entries) + len(discarded_entries)
stats["polish_discard_rate"] = f"{len(discarded_entries)/total_processed*100:.1f}%"
polish_by_template = Counter(e.get("meta_template", "unknown") for e in polished_entries)
stats["polished_by_template"] = dict(sorted(polish_by_template.items()))
discard_by_template = Counter(e.get("meta_template", "unknown") for e in discarded_entries)
stats["discarded_by_template"] = dict(sorted(discard_by_template.items()))
# --- Filter stats ---
stats["filtered_count"] = len(filtered)
filter_by_template = Counter(e.get("meta_template", "unknown") for e in filtered)
stats["filtered_by_template"] = dict(sorted(filter_by_template.items()))
# Filter discard count
stats["discarded_during_filter"] = len(polished_entries) - len(filtered)
# --- Training pairs stats ---
stats["training_pair_count"] = len(training)
training_by_template = Counter(e.get("meta_template", "unknown") for e in training)
stats["training_by_template"] = dict(sorted(training_by_template.items()))
input_type_counts = Counter(classify_input_type(e.get("input", "")) for e in training)
stats["training_by_input_type"] = dict(sorted(input_type_counts.items()))
# --- Coverage analysis ---
used_words = set()
for entry in filtered:
slots = entry.get("slots", {})
for v in slots.values():
word = v.lower().replace(" ", "_")
if word in vocab_words:
used_words.add(word)
stats["unique_slot_words_used"] = len(used_words)
stats["total_vocab_words"] = len(vocab_words)
stats["vocab_coverage"] = f"{len(used_words)/len(vocab_words)*100:.1f}%" if vocab_words else "N/A"
never_used = sorted(vocab_words - used_words)
stats["words_never_used"] = never_used
stats["words_never_used_count"] = len(never_used)
# --- Saying length stats ---
lengths = []
for entry in filtered:
text = entry.get("polished_text", "")
if text:
lengths.append(len(text.split()))
if lengths:
stats["avg_saying_length_words"] = round(sum(lengths) / len(lengths), 1)
stats["min_saying_length_words"] = min(lengths)
stats["max_saying_length_words"] = max(lengths)
# --- Balance warnings ---
warnings = []
if filtered:
total_filtered = len(filtered)
for template, count in filter_by_template.items():
pct = count / total_filtered * 100
if pct < 10:
warnings.append(
f"WARNING: {template} has only {count} entries ({pct:.1f}%) — "
f"below 10% threshold. Generate more raw sayings for this family."
)
if training:
total_training = len(training)
for template, count in training_by_template.items():
pct = count / total_training * 100
if pct < 5:
warnings.append(
f"WARNING: {template} has only {count} training pairs ({pct:.1f}%) — very underrepresented."
)
stats["balance_warnings"] = warnings
# --- Write output ---
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(stats, f, indent=2, ensure_ascii=False)
# --- Print summary ---
print("=" * 60)
print("CORPUS STATISTICS")
print("=" * 60)
print(f"\nRaw sayings: {stats['raw_count']}")
print(f"Polished sayings: {stats['polished_count']}")
print(f"Discarded (polish): {stats.get('discarded_during_polish', 0)} ({stats.get('polish_discard_rate', 'N/A')})")
print(f"Discarded (filter): {stats.get('discarded_during_filter', 0)}")
print(f"Final filtered: {stats['filtered_count']}")
print(f"Training pairs: {stats['training_pair_count']}")
print(f"\nDistribution by meta-template (filtered):")
for t, c in sorted(filter_by_template.items()):
pct = c / len(filtered) * 100 if filtered else 0
print(f" {t:30s} {c:5d} ({pct:5.1f}%)")
print(f"\nDistribution by input framing type:")
for t, c in sorted(input_type_counts.items()):
print(f" {t:20s} {c:5d}")
print(f"\nVocab coverage: {stats['vocab_coverage']} ({stats['unique_slot_words_used']}/{stats['total_vocab_words']})")
print(f"Average saying length: {stats.get('avg_saying_length_words', 'N/A')} words")
if warnings:
print(f"\nBalance warnings:")
for w in warnings:
print(f" {w}")
print(f"\nFull stats: {output_path}")
if __name__ == "__main__":
main()

scripts/enhance_graph.py Normal file

@@ -0,0 +1,787 @@
#!/usr/bin/env python3
"""LLM-augmented graph enhancement for the folksy subgraph.
Three phases:
Phase 1: Per-word relationship expansion
Phase 2: Cross-word bridge discovery
Phase 3: Property enrichment for false_equivalence templates
Usage:
python scripts/enhance_graph.py --phase 1 # Run phase 1 only
python scripts/enhance_graph.py --phase 2 # Run phase 2 only
python scripts/enhance_graph.py --phase 3 # Run phase 3 only
python scripts/enhance_graph.py --all # Run all phases
python scripts/enhance_graph.py --phase 1 --dry-run # Print prompts without calling LLM
"""
import argparse
import csv
import os
import random
import re
import sys
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path
# Paths
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"
VALID_RELATIONS = {
"AtLocation", "MadeOf", "PartOf", "UsedFor", "HasA", "HasProperty",
"Causes", "HasPrerequisite", "CapableOf", "ReceivesAction", "Desires",
"CausesDesire", "LocatedNear", "CreatedBy", "MotivatedByGoal", "HasSubevent",
}
AUGMENTED_CSV = DATA_DIR / "folksy_relations_augmented.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"
LOG_CSV = DATA_DIR / "enhancement_log.csv"
# ---------------------------------------------------------------------------
# Infrastructure
# ---------------------------------------------------------------------------
def llm_chat_completion(messages, max_retries=3):
"""Chat completion with retry logic."""
import requests
for attempt in range(max_retries):
try:
resp = requests.post(LLM_ENDPOINT, json={
"model": LLM_MODEL,
"messages": messages,
}, timeout=120)
resp.raise_for_status()
data = resp.json()
return data["choices"][0]["message"]["content"]
except Exception as e:
wait = (2 ** attempt)
print(f" LLM call failed (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
if attempt < max_retries - 1:
print(f" Retrying in {wait}s...", file=sys.stderr)
time.sleep(wait)
else:
print(f" Giving up on this word.", file=sys.stderr)
return None
def load_vocab():
"""Load folksy vocabulary."""
vocab = {}
with open(DATA_DIR / "folksy_vocab.csv", newline="", encoding="utf-8") as f:
for row in csv.DictReader(f):
word = row["word"]
cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
vocab[word] = {
"categories": cats,
"tangibility": float(row.get("tangibility_score", 0)),
"edge_count": int(row.get("conceptnet_edge_count", 0)),
}
return vocab
def load_relations():
"""Load existing relations (ConceptNet + any existing augmented)."""
edges = defaultdict(list) # (start, relation) -> [(end, weight, surface)]
existing_triples = set() # (start, end, relation) for dedup
for path in [DATA_DIR / "folksy_relations.csv", AUGMENTED_CSV]:
if not path.exists():
continue
with open(path, newline="", encoding="utf-8") as f:
for row in csv.DictReader(f):
sw = row["start_word"]
ew = row["end_word"]
rel = row["relation"]
                if not row["weight"]: continue  # skip rows with a missing/empty weight value (possibly a corrupted line)
w = float(row["weight"])
surf = row.get("surface_text", "")
edges[(sw, rel)].append((ew, w, surf))
existing_triples.add((sw, ew, rel))
return edges, existing_triples
def load_checkpoint():
"""Load enhancement log to determine what's already been processed."""
processed = set() # (word, phase)
if LOG_CSV.exists():
with open(LOG_CSV, newline="", encoding="utf-8") as f:
for row in csv.DictReader(f):
processed.add((row["source_word"], row["phase"]))
return processed
def append_log(word, phase, edges_generated, edges_accepted, edges_duplicate, edges_oov):
"""Append a row to the enhancement log."""
write_header = not LOG_CSV.exists()
with open(LOG_CSV, "a", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
if write_header:
writer.writerow(["source_word", "phase", "timestamp",
"edges_generated", "edges_accepted", "edges_duplicate", "edges_oov"])
writer.writerow([word, phase, datetime.now().isoformat(),
edges_generated, edges_accepted, edges_duplicate, edges_oov])
def append_augmented_edges(edges):
"""Append edges to the augmented relations CSV."""
write_header = not AUGMENTED_CSV.exists()
with open(AUGMENTED_CSV, "a", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
if write_header:
writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text", "source"])
for e in edges:
writer.writerow([e["start_word"], e["end_word"], e["relation"],
e["weight"], e["surface_text"], e["source"]])
def append_candidates(candidates):
"""Append candidate words to the candidate additions CSV."""
write_header = not CANDIDATE_CSV.exists()
with open(CANDIDATE_CSV, "a", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
if write_header:
writer.writerow(["word", "suggested_by", "relation_context", "frequency"])
for c in candidates:
writer.writerow([c["word"], c["suggested_by"], c["relation_context"], c["frequency"]])
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def parse_llm_relations(response_text, source_word):
"""Parse structured LLM output into edge dicts.
Handles bullets, numbering, extra whitespace, multi-word targets.
"""
edges = []
if not response_text:
return edges
for line in response_text.strip().split("\n"):
line = line.strip()
if not line:
continue
# Strip leading bullets/numbers: "- ", "1. ", "* ", etc.
line = re.sub(r"^[\d]+[.)]\s*", "", line)
line = re.sub(r"^[-*•]\s*", "", line)
line = line.strip()
if not line or "NONE" in line.upper():
continue
# Match: RELATION_TYPE: target_word(s) | surface text
match = re.match(r"^(\w+):\s*(.+?)\s*\|\s*(.+)$", line)
if not match:
continue
relation, target_raw, surface = match.groups()
relation = relation.strip()
if relation not in VALID_RELATIONS:
continue
# Normalize target: lowercase, replace spaces with underscores for multi-word
target = target_raw.strip().lower()
target = re.sub(r"\s+", "_", target)
# Skip self-loops
if target == source_word:
continue
edges.append({
"start_word": source_word,
"end_word": target,
"relation": relation,
"weight": 0.8,
"surface_text": surface.strip(),
"source": "llm_augmented",
})
return edges
def parse_bridge_response(response_text, word_a, word_b):
"""Parse bridge discovery LLM output."""
edges = []
if not response_text:
return edges
for line in response_text.strip().split("\n"):
line = line.strip()
if not line:
continue
# Strip common prefixes
line = re.sub(r"^[\d]+[.)]\s*", "", line)
line = re.sub(r"^[-*•]\s*", "", line)
line = re.sub(r"^BRIDGE:\s*", "", line, flags=re.IGNORECASE)
line = line.strip()
if not line:
continue
# BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation
parts = [p.strip() for p in line.split("|")]
if len(parts) < 3:
continue
bridge_word = parts[0].strip().lower().replace(" ", "_")
# Parse relation_to_first
rel1_match = re.search(r"(?:relation_to_first|first):\s*(\w+)", parts[1], re.IGNORECASE)
rel2_match = re.search(r"(?:relation_to_second|second):\s*(\w+)", parts[2], re.IGNORECASE)
if not rel1_match or not rel2_match:
# Try simpler format: just the relation type
rel1_match = re.match(r"(\w+)", parts[1].split(":")[-1].strip())
rel2_match = re.match(r"(\w+)", parts[2].split(":")[-1].strip())
if not rel1_match or not rel2_match:
continue
rel1 = rel1_match.group(1)
rel2 = rel2_match.group(1)
if rel1 not in VALID_RELATIONS or rel2 not in VALID_RELATIONS:
continue
explanation = parts[3].strip() if len(parts) > 3 else ""
# Create edges: word_a -> bridge and bridge -> word_b
edges.append({
"start_word": word_a,
"end_word": bridge_word,
"relation": rel1,
"weight": 0.8,
"surface_text": explanation,
"source": "llm_bridge",
})
edges.append({
"start_word": bridge_word,
"end_word": word_b,
"relation": rel2,
"weight": 0.8,
"surface_text": explanation,
"source": "llm_bridge",
})
return edges
def parse_property_response(response_text, word):
"""Parse property enrichment LLM output."""
edges = []
if not response_text:
return edges
for line in response_text.strip().split("\n"):
line = line.strip()
if not line:
continue
line = re.sub(r"^[\d]+[.)]\s*", "", line)
line = re.sub(r"^[-*•]\s*", "", line)
line = line.strip()
if not line:
continue
# PROPERTY | explanation
parts = [p.strip() for p in line.split("|")]
if len(parts) < 1:
continue
prop = parts[0].strip().lower().replace(" ", "_")
explanation = parts[1].strip() if len(parts) > 1 else f"{word} is {prop}"
if not prop or prop == word:
continue
edges.append({
"start_word": word,
"end_word": prop,
"relation": "HasProperty",
"weight": 0.8,
"surface_text": explanation,
"source": "llm_property",
})
return edges
# ---------------------------------------------------------------------------
# Phase 1: Per-Word Expansion
# ---------------------------------------------------------------------------
PHASE1_SYSTEM = """You are a commonsense knowledge annotator. You will be given a concrete noun and its known relationships. Your job is to generate ADDITIONAL commonsense relationships that are missing.
Rules:
- Only generate relationships involving concrete, tangible things (animals, foods, tools, plants, buildings, weather, landscape, household objects)
- Every relationship must be something a typical adult would agree is true
- Do not repeat any relationship already listed as "known"
- Target words should be common English words (top 3000 frequency preferred)
- Output ONLY the structured format shown below, one relationship per line
- If you cannot think of good relationships for a given type, output NONE for that type
- Aim for 3-5 relationships per type where possible
Output format (one per line):
RELATION_TYPE: target_word | short natural phrasing
Example output:
AtLocation: barn | you find a horse in a barn
UsedFor: riding | a horse is used for riding
HasA: mane | a horse has a mane
CapableOf: gallop | a horse can gallop
MadeOf: NONE
PartOf: herd | a horse is part of a herd"""
PHASE1_USER = """Word: {word}
Categories: {categories}
Known relationships:
{existing_edges}
Generate additional relationships for these types:
- AtLocation (where is it found?)
- UsedFor (what is it used for?)
- HasA (what does it have / contain?)
- PartOf (what is it part of?)
- CapableOf (what can it do?)
- MadeOf (what is it made of?)
- HasPrerequisite (what do you need before you can have/use it?)
- Causes (what does it cause or lead to?)
- HasProperty (what adjectives describe it? limit to physical/sensory properties)"""
def format_existing_edges(edges_dict, word):
"""Format existing edges for a word grouped by relation type."""
relation_types = ["AtLocation", "UsedFor", "HasA", "PartOf", "CapableOf",
"MadeOf", "HasPrerequisite", "Causes", "HasProperty"]
lines = []
for rel in relation_types:
targets = edges_dict.get((word, rel), [])
if targets:
formatted = ", ".join(f"{t[0]} (weight {t[1]:.1f})" for t in targets[:10])
lines.append(f"{rel}: {formatted}")
else:
lines.append(f"{rel}: (none in database)")
return "\n".join(lines)
def run_phase1(vocab, edges, existing_triples, checkpoint, dry_run=False):
"""Phase 1: Per-word relationship expansion."""
words = sorted(vocab.keys())
total = len(words)
total_accepted = 0
total_skipped = 0
print(f"Phase 1: Processing {total} words...")
for i, word in enumerate(words):
if (word, "1") in checkpoint:
total_skipped += 1
continue
categories = ", ".join(vocab[word]["categories"])
existing = format_existing_edges(edges, word)
user_prompt = PHASE1_USER.format(
word=word, categories=categories, existing_edges=existing
)
messages = [
{"role": "system", "content": PHASE1_SYSTEM},
{"role": "user", "content": user_prompt},
]
if dry_run:
if i < 3: # Show first 3 prompts
print(f"\n--- Prompt for '{word}' ---")
print(f"System: {PHASE1_SYSTEM[:200]}...")
print(f"User:\n{user_prompt}")
elif i == 3:
print(f"\n... ({total - 3} more words) ...")
continue
response = llm_chat_completion(messages)
parsed = parse_llm_relations(response, word) if response else []
# Classify edges
accepted = []
candidates = []
duplicates = 0
for edge in parsed:
triple = (edge["start_word"], edge["end_word"], edge["relation"])
if triple in existing_triples:
duplicates += 1
continue
existing_triples.add(triple)
if edge["end_word"] in vocab:
accepted.append(edge)
else:
candidates.append({
"word": edge["end_word"],
"suggested_by": word,
"relation_context": f"{edge['relation']}: {edge['surface_text']}",
"frequency": 1,
})
if accepted:
append_augmented_edges(accepted)
# Also update in-memory edges for subsequent words
for e in accepted:
edges[(e["start_word"], e["relation"])].append(
(e["end_word"], e["weight"], e["surface_text"]))
if candidates:
append_candidates(candidates)
total_accepted += len(accepted)
append_log(word, "1", len(parsed), len(accepted), duplicates, len(candidates))
if (i + 1) % 50 == 0:
print(f" [{i+1}/{total}] {total_accepted} edges accepted so far")
time.sleep(0.1)
if dry_run:
print(f"\nDry run complete. Would process {total - total_skipped} words.")
else:
print(f"\nPhase 1 complete: {total_accepted} new edges accepted.")
# ---------------------------------------------------------------------------
# Phase 2: Cross-Word Bridge Discovery
# ---------------------------------------------------------------------------
PHASE2_SYSTEM = """You are a commonsense knowledge annotator. You will be given two concrete nouns. Your job is to identify a BRIDGE word that connects them — something that relates to both.
Rules:
- The bridge word must be a common, concrete noun
- State the relationship type for each connection
- Valid relationship types: AtLocation, UsedFor, HasA, PartOf, CapableOf, MadeOf, HasPrerequisite, Causes, HasProperty, ReceivesAction, Desires, CausesDesire, LocatedNear, CreatedBy
- Output format: BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation
Example:
Words: "cow" and "butter"
milk | relation_to_first: CapableOf | relation_to_second: MadeOf | milk connects production to product"""
PHASE2_USER = """Words: "{word_a}" and "{word_b}"
Categories: {word_a} is {categories_a}, {word_b} is {categories_b}
Find 1-3 bridge words that connect them."""
def build_reachability(vocab, edges):
"""Build 2-hop reachability from vocab words to other vocab words."""
vocab_set = set(vocab.keys())
reachable = defaultdict(set) # word -> set of reachable vocab words
for word in vocab:
# Direct (1-hop) neighbors in vocab
for (sw, rel), targets in edges.items():
if sw == word:
for (ew, w, s) in targets:
if ew in vocab_set and ew != word:
reachable[word].add(ew)
# 2-hop from this neighbor
for (sw2, rel2), targets2 in edges.items():
if sw2 == ew:
for (ew2, w2, s2) in targets2:
if ew2 in vocab_set and ew2 != word:
reachable[word].add(ew2)
return reachable
def run_phase2(vocab, edges, existing_triples, checkpoint, dry_run=False):
"""Phase 2: Cross-word bridge discovery."""
print("Phase 2: Building reachability matrix...")
reachable = build_reachability(vocab, edges)
# Find low-connectivity words
vocab_set = set(vocab.keys())
low_connectivity = []
for word in vocab:
reach_count = len(reachable.get(word, set()))
if reach_count < 10:
low_connectivity.append((word, reach_count))
low_connectivity.sort(key=lambda x: x[1])
print(f" {len(low_connectivity)} words with <10 reachable vocab words")
# Build category index
by_category = defaultdict(list)
for word, info in vocab.items():
for cat in info["categories"]:
by_category[cat].append(word)
total_accepted = 0
pairs_processed = 0
total_skipped = 0
for word, reach_count in low_connectivity:
if (word, "2") in checkpoint:
total_skipped += 1
continue
word_cats = vocab[word]["categories"]
word_reachable = reachable.get(word, set())
# Find same-category words that are unreachable
unreachable = []
for cat in word_cats:
for peer in by_category.get(cat, []):
if peer != word and peer not in word_reachable:
unreachable.append(peer)
if not unreachable:
append_log(word, "2", 0, 0, 0, 0)
continue
# Sample 5-10 unreachable peers
sample = random.sample(unreachable, min(10, len(unreachable)))
accepted_for_word = 0
for peer in sample:
pair_key = f"{word}:{peer}"
if (pair_key, "2") in checkpoint:
continue
categories_a = ", ".join(vocab[word]["categories"])
categories_b = ", ".join(vocab[peer]["categories"])
user_prompt = PHASE2_USER.format(
word_a=word, word_b=peer,
categories_a=categories_a, categories_b=categories_b,
)
messages = [
{"role": "system", "content": PHASE2_SYSTEM},
{"role": "user", "content": user_prompt},
]
if dry_run:
if pairs_processed < 3:
print(f"\n--- Bridge prompt: '{word}' <-> '{peer}' ---")
print(f"User:\n{user_prompt}")
elif pairs_processed == 3:
print(f"\n... (more pairs) ...")
pairs_processed += 1
continue
response = llm_chat_completion(messages)
parsed = parse_bridge_response(response, word, peer) if response else []
accepted = []
duplicates = 0
oov = 0
for edge in parsed:
triple = (edge["start_word"], edge["end_word"], edge["relation"])
if triple in existing_triples:
duplicates += 1
continue
existing_triples.add(triple)
# For bridge edges, both endpoints should ideally be in vocab
if edge["start_word"] in vocab_set and edge["end_word"] in vocab_set:
accepted.append(edge)
elif edge["start_word"] in vocab_set or edge["end_word"] in vocab_set:
# At least one end in vocab — still useful
accepted.append(edge)
else:
oov += 1
if accepted:
append_augmented_edges(accepted)
for e in accepted:
edges[(e["start_word"], e["relation"])].append(
(e["end_word"], e["weight"], e["surface_text"]))
accepted_for_word += len(accepted)
pairs_processed += 1
time.sleep(0.1)
total_accepted += accepted_for_word
append_log(word, "2", 0, accepted_for_word, 0, 0)
if (pairs_processed) % 20 == 0:
print(f" {pairs_processed} pairs processed, {total_accepted} edges accepted")
if dry_run:
print(f"\nDry run complete. Would process {pairs_processed} word pairs.")
else:
print(f"\nPhase 2 complete: {total_accepted} bridge edges accepted from {pairs_processed} pairs.")
# ---------------------------------------------------------------------------
# Phase 3: Property Enrichment
# ---------------------------------------------------------------------------
PHASE3_SYSTEM = """You are a commonsense knowledge annotator. Given a concrete noun, list its most distinctive physical or sensory properties — things you could see, touch, hear, smell, or taste. Also list behavioral properties for animals.
Rules:
- Only physical/sensory/behavioral properties, not abstract qualities
- Properties should DISTINGUISH this thing from similar things in its category
- Output one property per line as: PROPERTY | brief explanation
- Aim for 5-8 properties"""
PHASE3_USER = """Word: {word}
Category: {categories}
Other words in same category: {peers}
What properties distinguish {word} from the others listed?"""
def run_phase3(vocab, edges, existing_triples, checkpoint, dry_run=False):
"""Phase 3: Property enrichment for false_equivalence templates."""
by_category = defaultdict(list)
for word, info in vocab.items():
for cat in info["categories"]:
by_category[cat].append(word)
words = sorted(vocab.keys())
total = len(words)
total_accepted = 0
total_skipped = 0
print(f"Phase 3: Property enrichment for {total} words...")
for i, word in enumerate(words):
if (word, "3") in checkpoint:
total_skipped += 1
continue
word_cats = vocab[word]["categories"]
categories = ", ".join(word_cats)
# Gather same-category peers (sample of 10)
peers = set()
for cat in word_cats:
for peer in by_category.get(cat, []):
if peer != word:
peers.add(peer)
peer_sample = random.sample(list(peers), min(10, len(peers))) if peers else []
if not peer_sample:
append_log(word, "3", 0, 0, 0, 0)
continue
user_prompt = PHASE3_USER.format(
word=word, categories=categories,
peers=", ".join(peer_sample),
)
messages = [
{"role": "system", "content": PHASE3_SYSTEM},
{"role": "user", "content": user_prompt},
]
if dry_run:
if i < 3:
print(f"\n--- Property prompt for '{word}' ---")
print(f"User:\n{user_prompt}")
elif i == 3:
print(f"\n... ({total - 3} more words) ...")
continue
response = llm_chat_completion(messages)
parsed = parse_property_response(response, word) if response else []
accepted = []
duplicates = 0
for edge in parsed:
triple = (edge["start_word"], edge["end_word"], edge["relation"])
if triple in existing_triples:
duplicates += 1
continue
existing_triples.add(triple)
accepted.append(edge)
if accepted:
append_augmented_edges(accepted)
for e in accepted:
edges[(e["start_word"], e["relation"])].append(
(e["end_word"], e["weight"], e["surface_text"]))
total_accepted += len(accepted)
append_log(word, "3", len(parsed), len(accepted), duplicates, 0)
if (i + 1) % 50 == 0:
print(f" [{i+1}/{total}] {total_accepted} properties accepted so far")
time.sleep(0.1)
if dry_run:
print(f"\nDry run complete. Would process {total - total_skipped} words.")
else:
print(f"\nPhase 3 complete: {total_accepted} new HasProperty edges accepted.")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
description="LLM-augmented graph enhancement for folksy subgraph."
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--phase", type=int, choices=[1, 2, 3],
help="Run a specific phase (1, 2, or 3)")
group.add_argument("--all", action="store_true",
help="Run all three phases in sequence")
parser.add_argument("--dry-run", action="store_true",
help="Print prompts without calling LLM")
args = parser.parse_args()
vocab = load_vocab()
edges, existing_triples = load_relations()
checkpoint = load_checkpoint()
print(f"Loaded {len(vocab)} vocab words, {len(existing_triples)} existing edge triples.")
print(f"Checkpoint: {len(checkpoint)} (word, phase) pairs already processed.")
phases = [args.phase] if args.phase else [1, 2, 3]
for phase in phases:
print(f"\n{'='*60}")
print(f"Running Phase {phase}")
print(f"{'='*60}")
if phase == 1:
run_phase1(vocab, edges, existing_triples, checkpoint, args.dry_run)
elif phase == 2:
run_phase2(vocab, edges, existing_triples, checkpoint, args.dry_run)
elif phase == 3:
run_phase3(vocab, edges, existing_triples, checkpoint, args.dry_run)
# Reload checkpoint after each phase for resumability
checkpoint = load_checkpoint()
print("\nDone.")
if __name__ == "__main__":
main()

scripts/expand_vocab.py Normal file

@@ -0,0 +1,512 @@
#!/usr/bin/env python3
"""Expand folksy vocabulary with high-quality candidates from LLM suggestions.
Reads candidate_additions.csv (words suggested by the LLM during phase 1 that
weren't in the vocab), filters for quality, uses the LLM to assign categories,
and appends the survivors to folksy_vocab.csv.
After running this, re-run `enhance_graph.py --phase 1` to generate edges
for the new words (the checkpoint will skip already-processed words).
Usage:
python scripts/expand_vocab.py # Full run
python scripts/expand_vocab.py --dry-run # Show what would be added
python scripts/expand_vocab.py --min-citations 8 # Stricter threshold
"""
import argparse
import csv
import json
import re
import shutil
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"
VOCAB_CSV = DATA_DIR / "folksy_vocab.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"
# Valid categories from the existing vocabulary
VALID_CATEGORIES = {
"animal", "beverage", "bird", "building", "clothing", "container", "crop",
"fabric", "fish", "flower", "food", "fruit", "furniture", "grain", "herb",
"insect", "instrument", "landscape", "material", "metal", "mineral",
"organism", "plant", "rock", "seed", "shelter", "spice", "stone",
"structure", "tool", "tree", "vegetable", "vehicle", "water", "weapon", "wood",
}
# ---------------------------------------------------------------------------
# Exclusion lists
# ---------------------------------------------------------------------------
# Abstract concepts, emotions, processes — not concrete enough for folksy vocab
EXCLUDE_ABSTRACT = {
"ecosystem", "satisfaction", "fullness", "warmth", "fear", "relaxation",
"growth", "interest", "nature", "protection", "digestion", "injury",
"decoration", "construction", "landscape", "noise", "sound", "energy",
"nourishment", "nutrition", "pollination", "sustainability", "tradition",
"biodiversity", "symbolism", "elegance", "resilience", "patience",
"beauty", "abundance", "fertility", "creativity", "harmony", "comfort",
"curiosity", "companionship", "loyalty", "aggression", "alertness",
"camouflage", "predation", "migration", "hibernation", "decomposition",
"erosion", "combustion", "fermentation", "oxidation", "corrosion",
"photosynthesis", "respiration", "evaporation", "precipitation",
"transpiration", "germination", "excitement", "enjoyment", "satiety",
"stability", "organization", "fragrance", "moisture", "wildlife",
"preservation", "conversation", "inspiration", "storage", "observation",
"hydration", "destruction", "entertainment", "education", "knowledge",
"safety", "practice", "research", "skill", "space", "license",
"collection", "habitat", "pollution", "health", "vibration", "wonder",
"awe", "refreshment", "irritation", "happiness", "joy", "damage",
"death", "pain", "thirst", "fear", "alarm", "contents", "ingredients",
"electricity", "oxygen", "navigation", "recreation", "meditation",
"nutrition", "celebration", "communication", "imagination", "devotion",
"ambition", "endurance", "independence", "discipline", "cooperation",
"sweetness", "fullness", "aroma", "flavor", "fragrance", "texture",
"smell", "color", "contents", "surface", "bottom", "edge",
"nutrients", "study", "outfit", "upholstery",
}
# Scientific/technical — not folksy enough for folk wisdom
EXCLUDE_TECHNICAL = {
"cellulose", "exoskeleton", "protein", "tissue", "cells", "alloy",
"cellulose", "enzyme", "chlorophyll", "genome", "photon",
"organism", "molecule", "compound", "polymer", "isotope",
"ecosystem", "metabolism", "catalyst", "membrane", "chromosome",
"cell", "nutrient", "ingredient", "material", "content",
}
# Collective/institutional nouns — not concrete individual things
EXCLUDE_INSTITUTIONAL = {
"orchestra", "fleet", "arsenal", "toolkit", "collection",
"restaurant", "museum", "university", "corporation", "organization",
"musician", "breakfast", "dinner", "meal", "dish", "sandwich",
"seafood", "refrigerator", "garage", "basement", "park",
}
# Adjectives and properties — useful as HasProperty targets but not as vocab words
EXCLUDE_ADJECTIVES = {
"small", "large", "heavy", "colorful", "green", "brown", "hard",
"white", "round", "sharp", "sturdy", "long", "soft", "flat",
"sweet", "bitter", "smooth", "rough", "bright", "dark", "dry",
"wet", "thick", "thin", "warm", "cold", "hot", "tall", "short",
"red", "blue", "yellow", "black", "grey", "gray", "pink",
"fragrant", "loud", "spicy", "sour", "tough", "delicate", "strong",
"weak", "light", "dense", "portable", "lightweight", "transparent",
"opaque", "flexible", "rigid", "brittle", "elastic", "porous",
"compact", "edible", "toxic", "aromatic", "nocturnal", "aquatic",
"durable", "cylindrical", "wooden", "shiny", "solid", "narrow",
"metallic", "pungent", "juicy", "fast", "powerful", "woody",
"fibrous", "savory", "liquid", "enclosed", "rectangular", "wild",
"feathered", "leafy", "crunchy", "dangerous", "fuzzy", "slimy",
"natural", "waterproof", "electronic",
}
# Words that are clearly verbs or gerunds
EXCLUDE_VERBS = {
"eating", "cooking", "growing", "fishing", "hunting", "flying",
"mining", "flavoring", "singing", "blooming", "holding", "baking",
"ripening", "opening", "cutting", "protecting", "seasoning",
"storing", "building", "swimming", "brewing", "weaving", "carving",
"climbing", "digging", "plowing", "sewing", "spinning", "tanning",
"swim", "run", "grow", "eat", "hunt", "peck", "bite", "dive",
"crawl", "cut", "shine", "sparkle",
}
def singularize(word):
"""Best-effort singularization. Returns (singular, was_plural)."""
# Irregular plurals
irregulars = {
"teeth": "tooth", "feet": "foot", "geese": "goose", "mice": "mouse",
"lice": "louse", "dice": "die", "oxen": "ox", "children": "child",
"leaves": "leaf", "loaves": "loaf", "halves": "half", "knives": "knife",
"lives": "life", "wives": "wife", "wolves": "wolf", "shelves": "shelf",
"calves": "calf",
}
if word in irregulars:
return irregulars[word], True
# -ves -> -f (already covered some above, catch remaining)
if word.endswith("ves"):
candidate = word[:-3] + "f"
return candidate, True
# -ies -> -y
if word.endswith("ies") and len(word) > 4:
return word[:-3] + "y", True
# -ses, -xes, -zes, -ches, -shes -> drop -es
if word.endswith(("ses", "xes", "zes", "ches", "shes")):
return word[:-2], True
# -s (but not -ss, -us, -is)
if word.endswith("s") and not word.endswith(("ss", "us", "is")):
return word[:-1], True
return word, False
def is_plural_of_existing(word, existing_vocab):
"""Check if word is likely a plural form of an existing vocab word."""
# word + s
if word.endswith("s") and word[:-1] in existing_vocab:
return True
# word + es
if word.endswith("es") and word[:-2] in existing_vocab:
return True
# word ending ies -> y
if word.endswith("ies") and word[:-3] + "y" in existing_vocab:
return True
# word ending ves -> f/fe
if word.endswith("ves"):
if word[:-3] + "f" in existing_vocab:
return True
if word[:-3] + "fe" in existing_vocab:
return True
return False
def is_plural_of_candidate(word, accepted_words):
"""Check if word is a plural of another candidate, or vice versa."""
# Is this word a plural of something accepted?
if word.endswith("s") and word[:-1] in accepted_words:
return True
if word.endswith("es") and word[:-2] in accepted_words:
return True
if word.endswith("ies") and word[:-3] + "y" in accepted_words:
return True
# Is something accepted a plural of this word?
if word + "s" in accepted_words:
return True
if word + "es" in accepted_words:
return True
if word.endswith("f") and word[:-1] + "ves" in accepted_words:
return True
if word.endswith("fe") and word[:-2] + "ves" in accepted_words:
return True
return False
# ---------------------------------------------------------------------------
# LLM categorization
# ---------------------------------------------------------------------------
CATEGORIZE_SYSTEM = """You are a vocabulary categorizer. Given a list of concrete nouns, assign each one to one or more categories from this fixed list:
animal, beverage, bird, building, clothing, container, crop, fabric, fish, flower, food, fruit, furniture, grain, herb, insect, instrument, landscape, material, metal, mineral, organism, plant, rock, seed, shelter, spice, stone, structure, tool, tree, vegetable, vehicle, water, weapon, wood
Rules:
- Use ONLY categories from the list above
- A word can have multiple categories (e.g., "brick" -> material, stone)
- If a word fits none of the categories well, output SKIP
- Output format: word: category1, category2
- One word per line"""
CATEGORIZE_USER = """Categorize these words:
{word_list}"""
def llm_chat_completion(messages, max_retries=3):
"""Chat completion with retry logic."""
import requests
for attempt in range(max_retries):
try:
resp = requests.post(LLM_ENDPOINT, json={
"model": LLM_MODEL,
"messages": messages,
}, timeout=120)
resp.raise_for_status()
data = resp.json()
return data["choices"][0]["message"]["content"]
except Exception as e:
wait = (2 ** attempt)
print(f" LLM call failed (attempt {attempt+1}/{max_retries}): {e}",
file=sys.stderr)
if attempt < max_retries - 1:
print(f" Retrying in {wait}s...", file=sys.stderr)
time.sleep(wait)
else:
print(f" Giving up on this batch.", file=sys.stderr)
return None
def parse_categories(response_text, valid_words):
"""Parse LLM categorization response."""
result = {}
if not response_text:
return result
for line in response_text.strip().split("\n"):
line = line.strip()
if not line:
continue
# Strip bullets/numbers
line = re.sub(r"^[\d]+[.)]\s*", "", line)
line = re.sub(r"^[-*•]\s*", "", line)
line = line.strip()
# Match: word: cat1, cat2
match = re.match(r"^(\w+)\s*:\s*(.+)$", line)
if not match:
continue
word = match.group(1).strip().lower()
cats_raw = match.group(2).strip()
if "SKIP" in cats_raw.upper():
continue
cats = []
for c in cats_raw.split(","):
c = c.strip().lower()
if c in VALID_CATEGORIES:
cats.append(c)
if word in valid_words and cats:
result[word] = cats
return result
def categorize_words(words, batch_size=25):
"""Categorize words using the LLM in batches."""
all_categories = {}
word_set = set(words)
for i in range(0, len(words), batch_size):
batch = words[i:i + batch_size]
word_list = "\n".join(f"- {w}" for w in batch)
messages = [
{"role": "system", "content": CATEGORIZE_SYSTEM},
{"role": "user", "content": CATEGORIZE_USER.format(word_list=word_list)},
]
response = llm_chat_completion(messages)
parsed = parse_categories(response, word_set)
all_categories.update(parsed)
categorized = len(parsed)
print(f" Batch {i // batch_size + 1}: {categorized}/{len(batch)} categorized")
time.sleep(0.1)
return all_categories
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
description="Expand folksy vocabulary with LLM-suggested candidates."
)
parser.add_argument("--min-citations", type=int, default=5,
help="Minimum number of vocab words that suggested this candidate (default: 5)")
parser.add_argument("--dry-run", action="store_true",
help="Show what would be added without modifying files")
parser.add_argument("--no-llm", action="store_true",
help="Skip LLM categorization (use placeholder categories)")
args = parser.parse_args()
# Load existing vocab
existing_vocab = {}
with open(VOCAB_CSV, newline="", encoding="utf-8") as f:
for row in csv.DictReader(f):
existing_vocab[row["word"]] = row
existing_words = set(existing_vocab.keys())
print(f"Existing vocabulary: {len(existing_words)} words")
# Load candidates
candidates = []
with open(CANDIDATE_CSV, newline="", encoding="utf-8") as f:
for row in csv.DictReader(f):
candidates.append(row)
# Aggregate: count unique sources per candidate word
word_sources = defaultdict(set)
for c in candidates:
word_sources[c["word"]].add(c["suggested_by"])
print(f"Total candidate rows: {len(candidates)}")
print(f"Unique candidate words: {len(word_sources)}")
# Normalize plurals: merge citation counts into singular forms
normalized_sources = defaultdict(set)
for word, sources in word_sources.items():
singular, was_plural = singularize(word)
# Merge into the singular form
normalized_sources[singular].update(sources)
# Replace word_sources with normalized version
word_sources = {w: srcs for w, srcs in normalized_sources.items()}
print(f"After singularization: {len(word_sources)} unique candidates")
# Filter
accepted = []
reject_reasons = Counter()
# Sort by citation count descending for consistent ordering
sorted_candidates = sorted(word_sources.items(), key=lambda x: len(x[1]), reverse=True)
accepted_set = set()
for word, sources in sorted_candidates:
citation_count = len(sources)
# Minimum citation threshold
if citation_count < args.min_citations:
reject_reasons["below_threshold"] += 1
continue
# No multi-word (underscore) candidates
if "_" in word:
reject_reasons["multi_word"] += 1
continue
# Already in vocab
if word in existing_words:
reject_reasons["already_in_vocab"] += 1
continue
# Exclude abstracts
if word in EXCLUDE_ABSTRACT:
reject_reasons["abstract"] += 1
continue
# Exclude adjectives
if word in EXCLUDE_ADJECTIVES:
reject_reasons["adjective"] += 1
continue
# Exclude verbs/gerunds
if word in EXCLUDE_VERBS:
reject_reasons["verb_gerund"] += 1
continue
# Exclude technical/scientific
if word in EXCLUDE_TECHNICAL:
reject_reasons["technical"] += 1
continue
# Exclude institutional/collective
if word in EXCLUDE_INSTITUTIONAL:
reject_reasons["institutional"] += 1
continue
# Gerund pattern catch-all (but allow exceptions)
if word.endswith("ing") and word not in {"ring", "spring", "string", "wing", "ceiling"}:
reject_reasons["gerund_pattern"] += 1
continue
# Exclude plurals of existing vocab
if is_plural_of_existing(word, existing_words):
reject_reasons["plural_of_existing"] += 1
continue
# Exclude plurals of already-accepted candidates
if is_plural_of_candidate(word, accepted_set):
reject_reasons["plural_of_candidate"] += 1
continue
# Single character
if len(word) < 2:
reject_reasons["too_short"] += 1
continue
accepted.append((word, citation_count))
accepted_set.add(word)
print(f"\nFiltering results:")
print(f" Accepted: {len(accepted)}")
for reason, count in reject_reasons.most_common():
print(f" Rejected ({reason}): {count}")
if not accepted:
print("\nNo candidates passed filtering.")
return
# Show accepted words
print(f"\nAccepted candidates ({len(accepted)}):")
for word, count in accepted:
print(f" {word:25s} cited by {count:3d} vocab words")
if args.dry_run:
print(f"\nDry run complete. Would add {len(accepted)} words to vocabulary.")
return
# Categorize with LLM
words_to_categorize = [w for w, _ in accepted]
if args.no_llm:
print("\nSkipping LLM categorization (--no-llm). Using 'material' as placeholder.")
categories = {w: ["material"] for w in words_to_categorize}
else:
print(f"\nCategorizing {len(words_to_categorize)} words with LLM...")
categories = categorize_words(words_to_categorize)
# Words the LLM couldn't categorize get skipped
uncategorized = [w for w in words_to_categorize if w not in categories]
if uncategorized:
print(f"\n {len(uncategorized)} words could not be categorized (skipped):")
for w in uncategorized:
print(f" {w}")
# Build new vocab entries
new_entries = []
for word, citation_count in accepted:
if word not in categories:
continue
cats = categories[word]
new_entries.append({
"word": word,
"categories": ",".join(cats),
"tangibility_score": "0.80",
"conceptnet_edge_count": "0",
"frequency_rank": "0",
})
if not new_entries:
print("\nNo entries to add after categorization.")
return
# Backup existing vocab
backup_path = VOCAB_CSV.with_suffix(f".csv.bak.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
shutil.copy2(VOCAB_CSV, backup_path)
print(f"\nBacked up vocabulary to {backup_path.name}")
# Append to vocab CSV
with open(VOCAB_CSV, "a", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=["word", "categories", "tangibility_score",
"conceptnet_edge_count", "frequency_rank"])
for entry in new_entries:
writer.writerow(entry)
print(f"\nAdded {len(new_entries)} words to {VOCAB_CSV.name}")
print(f"New vocabulary size: {len(existing_words) + len(new_entries)}")
# Summary by category
cat_counts = Counter()
for entry in new_entries:
for c in entry["categories"].split(","):
cat_counts[c.strip()] += 1
print(f"\nNew words by category:")
for cat, count in cat_counts.most_common():
print(f" {cat:20s} {count:3d}")
print(f"\nNext step: run 'python scripts/enhance_graph.py --phase 1' to generate edges for new words.")
if __name__ == "__main__":
main()

scripts/filter_corpus.py Normal file

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""Quality filtering for polished folksy sayings.
Reads corpus_polished.jsonl, applies quality filters, outputs filtered corpus
and discard analysis.
Usage:
python scripts/filter_corpus.py
python scripts/filter_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_filtered.jsonl
"""
import argparse
import csv
import json
import sys
from collections import Counter
from difflib import SequenceMatcher
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
def quality_filter(entry):
"""Apply quality filters to a polished entry.
Returns (passed, reason) tuple.
"""
text = entry.get("polished_text", "")
if not text:
return False, "no_polished_text"
words = text.split()
# Length check
if len(words) > 25:
return False, "too_long"
if len(words) < 5:
return False, "too_short"
# Must contain at least 2 of the original slot-fill nouns
slot_words = set(entry.get("slots", {}).values())
words_present = sum(1 for w in slot_words if w.lower() in text.lower())
if words_present < 2:
return False, "lost_key_nouns"
# No raw ConceptNet artifacts (multi-word underscore phrases)
if "_" in text:
return False, "conceptnet_artifact"
# No broken templates (unfilled slots)
if "{" in text or "}" in text:
return False, "unfilled_slot"
return True, "pass"
def is_near_duplicate(text_a, text_b, threshold=0.75):
"""Check if two texts are near-duplicates."""
return SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio() > threshold
def deduplicate_within_family(entries):
"""Remove near-duplicates within each meta-template family.
Returns (kept, removed) lists.
"""
by_family = {}
for entry in entries:
family = entry.get("meta_template", "unknown")
by_family.setdefault(family, []).append(entry)
kept = []
removed = []
for family, family_entries in by_family.items():
family_kept = []
for entry in family_entries:
text = entry.get("polished_text", "")
is_dup = False
for existing in family_kept:
if is_near_duplicate(text, existing.get("polished_text", "")):
is_dup = True
break
if is_dup:
removed.append((entry, "near_duplicate"))
else:
family_kept.append(entry)
kept.extend(family_kept)
return kept, removed
def main():
parser = argparse.ArgumentParser(description="Quality filtering for polished folksy sayings.")
parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
help="Input polished JSONL file")
parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
help="Output filtered JSONL file")
parser.add_argument("--discard-analysis", default=str(CORPUS_DIR / "discard_analysis.csv"),
help="Discard analysis CSV file")
args = parser.parse_args()
input_path = Path(args.input)
output_path = Path(args.output)
discard_path = Path(args.discard_analysis)
if not input_path.exists():
print(f"Error: {input_path} not found.", file=sys.stderr)
sys.exit(1)
# Load polished entries (only those with status=polished)
all_entries = []
already_discarded = 0
with open(input_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
entry = json.loads(line)
if entry.get("status") == "polished":
all_entries.append(entry)
elif entry.get("status") == "discarded":
already_discarded += 1
print(f"Loaded {len(all_entries)} polished entries ({already_discarded} already discarded by LLM)")
# Apply quality filters
passed = []
discards = [] # (entry, reason)
for entry in all_entries:
ok, reason = quality_filter(entry)
if ok:
passed.append(entry)
else:
discards.append((entry, reason))
print(f"Quality filter: {len(passed)} passed, {len(discards)} discarded")
    # Show discard breakdown
    reason_counts = Counter(r for _, r in discards)
for reason, count in reason_counts.most_common():
print(f" {reason}: {count}")
# Near-duplicate detection within template families
kept, dup_removed = deduplicate_within_family(passed)
discards.extend(dup_removed)
print(f"Near-duplicate removal: {len(dup_removed)} removed, {len(kept)} remaining")
# Write filtered output
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
for entry in kept:
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
print(f"\nFiltered corpus: {len(kept)} entries -> {output_path}")
# Write discard analysis
with open(discard_path, "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["raw_text", "meta_template", "discard_stage", "discard_reason"])
for entry, reason in discards:
writer.writerow([
entry.get("raw_text", ""),
entry.get("meta_template", ""),
"llm_polish" if reason == "no_polished_text" else "quality_filter",
reason,
])
print(f"Discard analysis: {len(discards)} entries -> {discard_path}")
if __name__ == "__main__":
main()

scripts/format_training_pairs.py Normal file

@@ -0,0 +1,385 @@
#!/usr/bin/env python3
"""Format filtered sayings into training pairs for fine-tuning.
Each polished saying generates 3-5 training pairs with different input framings.
Also generates fictional entity training pairs.
Usage:
python scripts/format_training_pairs.py
python scripts/format_training_pairs.py --input corpus/corpus_filtered.jsonl --output corpus/training_pairs.jsonl
"""
import argparse
import csv
import json
import random
import sys
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
DATA_DIR = PROJECT_DIR / "data"
EXAMPLES_DIR = PROJECT_DIR / "examples"
# Template name mappings for human-readable prompts
TEMPLATE_NAMES = {
"deconstruction": "deconstruction",
"denial_of_consequences": "denial of consequences",
"ironic_deficiency": "ironic deficiency",
"futile_preparation": "futile preparation",
"hypocritical_complaint": "hypocritical complaint",
"tautological_wisdom": "tautological wisdom",
"false_equivalence": "false equivalence",
}
PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]
OPEN_ENDED_PROMPTS = [
"Tell me some folk wisdom.",
"What do they say?",
"Give me a proverb.",
"Share some old-time wisdom.",
"What's a good saying?",
]
# Auto-generated fictional entities for additional training pairs
AUTO_ENTITIES = [
{
"name": "Stoneclaw",
"categories": ["animal", "predator"],
"properties": ["fierce", "rocky", "nocturnal"],
"relations": {"AtLocation": ["cave", "mountain"], "HasA": ["claws", "scales"], "CapableOf": ["hunting", "climbing"]},
},
{
"name": "Duskmelon",
"categories": ["fruit", "food"],
"properties": ["purple", "sweet", "fragrant"],
"relations": {"AtLocation": ["garden", "market"], "UsedFor": ["eating", "jam"], "MadeOf": ["seed", "juice"]},
},
{
"name": "Windloom",
"categories": ["tool", "craft"],
"properties": ["wooden", "portable", "intricate"],
"relations": {"UsedFor": ["weaving", "thread"], "MadeOf": ["wood", "string"], "AtLocation": ["workshop", "cottage"]},
},
{
"name": "Briarvine",
"categories": ["plant", "herb"],
"properties": ["thorny", "green", "medicinal"],
"relations": {"AtLocation": ["forest", "hedge"], "UsedFor": ["healing", "tea"], "HasA": ["thorn", "leaf"]},
},
{
"name": "Mudhog",
"categories": ["animal", "livestock"],
"properties": ["muddy", "stubborn", "heavy"],
"relations": {"AtLocation": ["farm", "swamp"], "Desires": ["food", "mud"], "CapableOf": ["digging", "rooting"]},
},
{
"name": "Frostberry",
"categories": ["fruit", "food"],
"properties": ["cold", "blue", "tiny"],
"relations": {"AtLocation": ["mountain", "tundra"], "UsedFor": ["eating", "preserves"], "HasProperty": ["cold", "tart"]},
},
{
"name": "Lanternmoss",
"categories": ["plant", "fungus"],
"properties": ["glowing", "damp", "soft"],
"relations": {"AtLocation": ["cave", "swamp"], "UsedFor": ["light", "decoration"], "HasProperty": ["luminous", "fragile"]},
},
{
"name": "Cinderhawk",
"categories": ["bird", "animal"],
"properties": ["fiery", "fast", "red"],
"relations": {"AtLocation": ["mountain", "volcano"], "CapableOf": ["flying", "hunting"], "HasA": ["talons", "feathers"]},
},
{
"name": "Rootstone",
"categories": ["stone", "material"],
"properties": ["veined", "hard", "ancient"],
"relations": {"AtLocation": ["quarry", "riverbed"], "UsedFor": ["building", "carving"], "MadeOf": ["mineral", "root"]},
},
{
"name": "Silkwort",
"categories": ["plant", "fiber"],
"properties": ["silky", "white", "tall"],
"relations": {"AtLocation": ["field", "meadow"], "UsedFor": ["weaving", "cloth"], "HasA": ["stem", "fiber"]},
},
{
"name": "Kettlefrog",
"categories": ["animal", "amphibian"],
"properties": ["loud", "round", "green"],
"relations": {"AtLocation": ["pond", "marsh"], "CapableOf": ["jumping", "croaking"], "Desires": ["flies", "water"]},
},
{
"name": "Dustwheat",
"categories": ["crop", "grain"],
"properties": ["dry", "golden", "hardy"],
"relations": {"AtLocation": ["field", "barn"], "UsedFor": ["bread", "flour"], "HasPrerequisite": ["rain", "soil"]},
},
]
def format_entity_description(entity):
"""Format entity into a natural description string."""
name = entity["name"]
cats = entity.get("categories", [])
props = entity.get("properties", [])
rels = entity.get("relations", {})
parts = []
# Category description
if props and cats:
prop_str = ", ".join(props[:3])
cat_str = " and ".join(cats[:2])
parts.append(f"A {name} is a {prop_str} {cat_str}.")
elif cats:
parts.append(f"A {name} is a {' and '.join(cats[:2])}.")
# Location
if "AtLocation" in rels:
locs = rels["AtLocation"]
parts.append(f"It is found near {' and '.join(locs[:2])}.")
# Parts/properties
if "HasA" in rels:
has = rels["HasA"]
parts.append(f"It has {', '.join(has[:3])}.")
# Capabilities
if "CapableOf" in rels:
caps = rels["CapableOf"]
parts.append(f"It can {' and '.join(caps[:2])}.")
# Uses
if "UsedFor" in rels:
uses = rels["UsedFor"]
parts.append(f"It is used for {' and '.join(uses[:2])}.")
return " ".join(parts)
def load_vocab_categories():
"""Load vocab to get word -> categories mapping."""
word_cats = {}
vocab_path = DATA_DIR / "folksy_vocab.csv"
if vocab_path.exists():
with open(vocab_path, newline="", encoding="utf-8") as f:
for row in csv.DictReader(f):
word = row["word"]
cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
word_cats[word] = cats
return word_cats
def generate_training_pairs(entry, word_cats):
"""Generate 3-5 training pairs for a single polished saying."""
polished = entry.get("polished_text", "")
slots = entry.get("slots", {})
meta_template = entry.get("meta_template", "")
# Collect source words (concrete nouns from slots)
source_words = [v for v in slots.values()
if v and not v.startswith("a ") and not v.startswith("an ") and len(v) > 1]
# Determine categories of slot words
slot_categories = set()
for word in source_words:
word_lower = word.lower().replace(" ", "_")
if word_lower in word_cats:
slot_categories.update(word_cats[word_lower])
pairs = []
base = {
"output": polished,
"meta_template": meta_template,
"source_words": source_words,
}
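    # The word-, category-, and persona-seeded framings below are attempted for every
    # entry; template-seeded and open-ended framings fire ~70% and ~30% of the time,
    # which is how each polished saying ends up with roughly 3-5 training pairs.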
# 1. Word-seeded (always include)
if source_words:
word = random.choice(source_words)
pairs.append({**base, "input": f"Tell me something about {word}."})
# 2. Category-seeded (always include if we have categories)
if slot_categories:
cat = random.choice(list(slot_categories))
pairs.append({**base, "input": f"Tell me a saying about {cat}."})
# 3. Persona-seeded (always include)
persona = random.choice(PERSONAS)
if source_words:
word = random.choice(source_words)
pairs.append({**base, "input": f"What would a {persona} say about {word}?"})
# 4. Template-seeded (include ~70% of the time)
if random.random() < 0.7:
template_name = TEMPLATE_NAMES.get(meta_template, meta_template)
pairs.append({**base, "input": f"Give me a {template_name} proverb."})
# 5. Open-ended (include ~30% of the time)
if random.random() < 0.3:
prompt = random.choice(OPEN_ENDED_PROMPTS)
pairs.append({**base, "input": prompt})
return pairs
def generate_fictional_pairs(entities):
"""Generate training pairs for fictional entities.
These pairs include the entity description in the input.
"""
pairs = []
# Generate 15-25 pairs per entity
for entity in entities:
name = entity["name"]
desc = format_entity_description(entity)
props = entity.get("properties", [])
rels = entity.get("relations", {})
# Collect words related to this entity
related_words = []
for targets in rels.values():
related_words.extend(targets)
n_pairs = random.randint(15, 25)
for _ in range(n_pairs):
framing = random.choice(["persona", "word", "category", "open"])
if framing == "persona":
persona = random.choice(PERSONAS)
input_text = f"{desc} What would a {persona} say about a {name}?"
elif framing == "word" and related_words:
word = random.choice(related_words)
input_text = f"{desc} Tell me a saying about {name} and {word}."
elif framing == "category":
cats = entity.get("categories", ["thing"])
cat = random.choice(cats)
input_text = f"{desc} Give me folk wisdom about this {cat}."
else:
input_text = f"{desc} Tell me some folk wisdom about {name}."
# Placeholder output — these would ideally be generated through the
# template engine with fictional entities loaded, then polished.
# For now, generate a structural placeholder that indicates the
# entity relationships.
pairs.append({
"input": input_text,
"output": "", # Will be filled by actual generation
"meta_template": "fictional",
"source_words": [name] + related_words[:3],
"_needs_generation": True,
"_entity": entity,
})
return pairs
def main():
parser = argparse.ArgumentParser(description="Format training pairs for fine-tuning.")
parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
help="Input filtered JSONL file")
parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"),
help="Output training pairs JSONL file")
parser.add_argument("--entities", default=str(EXAMPLES_DIR / "my_world.json"),
help="Fictional entities JSON file")
args = parser.parse_args()
input_path = Path(args.input)
output_path = Path(args.output)
entities_path = Path(args.entities)
if not input_path.exists():
print(f"Error: {input_path} not found.", file=sys.stderr)
sys.exit(1)
# Load vocab categories
word_cats = load_vocab_categories()
# Load filtered entries
entries = []
with open(input_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
entries.append(json.loads(line))
print(f"Loaded {len(entries)} filtered entries")
# Generate training pairs for each entry
all_pairs = []
for entry in entries:
pairs = generate_training_pairs(entry, word_cats)
all_pairs.extend(pairs)
print(f"Generated {len(all_pairs)} training pairs from polished sayings")
# Generate fictional entity pairs
fictional_entities = []
if entities_path.exists():
with open(entities_path, encoding="utf-8") as f:
data = json.load(f)
fictional_entities = data.get("entities", [])
print(f"Loaded {len(fictional_entities)} fictional entities from {entities_path}")
# Add auto-generated entities
fictional_entities.extend(AUTO_ENTITIES)
print(f"Total fictional entities (file + auto-generated): {len(fictional_entities)}")
fictional_pairs = generate_fictional_pairs(fictional_entities)
# Filter out placeholder pairs (those that still need generation)
# In a full pipeline, these would be generated through the template engine.
# For now, skip any with empty output.
real_fictional = [p for p in fictional_pairs if p.get("output")]
placeholder_fictional = [p for p in fictional_pairs if not p.get("output")]
if placeholder_fictional:
print(f" {len(placeholder_fictional)} fictional pairs need generation via template engine")
        print("  (Run folksy_generator.py with --entities to generate these, then re-run this script)")
all_pairs.extend(real_fictional)
# Clean up internal fields before writing
for pair in all_pairs:
pair.pop("_needs_generation", None)
pair.pop("_entity", None)
# Write output
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
for pair in all_pairs:
f.write(json.dumps(pair, ensure_ascii=False) + "\n")
# Stats
from collections import Counter
input_types = Counter()
for pair in all_pairs:
inp = pair["input"]
if inp.startswith("Tell me something about"):
input_types["word_seeded"] += 1
elif inp.startswith("Tell me a saying about"):
input_types["category_seeded"] += 1
elif inp.startswith("What would a"):
input_types["persona_seeded"] += 1
elif inp.startswith("Give me a") and "proverb" in inp:
input_types["template_seeded"] += 1
elif any(inp.startswith(p) for p in ["Tell me some folk", "What do they", "Give me a proverb", "Share some", "What's a good"]):
input_types["open_ended"] += 1
else:
input_types["fictional"] += 1
print(f"\nTotal training pairs: {len(all_pairs)}")
print("Distribution by input type:")
for itype, count in sorted(input_types.items()):
print(f" {itype:20s} {count:5d}")
print(f"\nOutput: {output_path}")
if __name__ == "__main__":
main()

61
scripts/generate_raw_batch.sh Executable file
View file

@@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Generate raw folksy sayings across all 7 templates.
# Output: corpus/corpus_raw.jsonl (~10,500 entries)
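# Usage: scripts/generate_raw_batch.sh [count_per_template]   (default: 1500)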
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
CORPUS_DIR="$PROJECT_DIR/corpus"
GENERATOR="$PROJECT_DIR/folksy_generator.py"
COUNT_PER_TEMPLATE=${1:-1500}
mkdir -p "$CORPUS_DIR"
OUTPUT="$CORPUS_DIR/corpus_raw.jsonl"
# Clear existing file
> "$OUTPUT"
TEMPLATES=(
deconstruction
denial_of_consequences
ironic_deficiency
futile_preparation
hypocritical_complaint
tautological_wisdom
false_equivalence
)
echo "Generating $COUNT_PER_TEMPLATE sayings per template (${#TEMPLATES[@]} templates)..."
echo "Output: $OUTPUT"
total=0
for template in "${TEMPLATES[@]}"; do
echo -n " $template ($COUNT_PER_TEMPLATE)... "
before=$(wc -l < "$OUTPUT")
python "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
after=$(wc -l < "$OUTPUT")
generated=$((after - before))
total=$((total + generated))
echo "$generated generated"
done
echo ""
echo "Total: $total raw sayings in $OUTPUT"
echo ""
# Check template distribution
echo "Template distribution:"
python -c "
import json, sys
from collections import Counter
counts = Counter()
with open('$OUTPUT') as f:
for line in f:
entry = json.loads(line)
counts[entry['meta_template']] += 1
for template, count in sorted(counts.items()):
print(f' {template:30s} {count:5d}')
print(f\" {'TOTAL':30s} {sum(counts.values()):5d}\")
"

215
scripts/polish_corpus.py Normal file
View file

@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""LLM polish pipeline for raw folksy sayings.
Reads corpus_raw.jsonl, sends each to GLM4-32B for polish.
The output file doubles as a checkpoint: results are appended as each entry is processed, and already-processed entries are skipped on resume.
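Each output line is the input record plus a "status" field ("polished", "discarded",
or "error"); polished entries also carry a "polished_text" field.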
Usage:
python scripts/polish_corpus.py
python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
"""
import argparse
import json
import sys
import time
from pathlib import Path
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
LLM_ENDPOINT = "http://192.168.1.100:8853/v1/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"
SYSTEM_PROMPT = """You are an editor specializing in folk sayings and rural proverbs. You will receive a rough draft of a fake folksy saying along with the relationship chain it encodes.
Your job:
1. Fix grammar, articles, and pluralization
2. Make it sound natural like something a weathered farmer would say while leaning on a fence post
3. Preserve the core nouns and the relationship between them; do not swap out the key words
4. You MAY add small colorful details (adjectives, folksy verb choices, regional flavor) but keep it concise; real proverbs are short
5. You MAY lightly restructure the sentence for better rhythm, but keep the same meaning pattern
6. If the saying is unsalvageable nonsense (the nouns don't relate in any meaningful way, or the combination is unintentionally offensive), respond with exactly: DISCARD
Output ONLY the polished saying on a single line. No quotes, no explanation, no preamble.
Examples of good polish:
Raw: "Don't build the coffee and act surprised when the water show up."
Chain: coffee MadeOf water
Polished: Don't brew the coffee and act surprised when the water's all gone.
Raw: "The chest's children always goes without hold books."
Chain: chest UsedFor hold_books
Polished: The bookshelf-maker's kids always end up reading off the floor.
Raw: "A pineapple is just a nectarine that's got an attitude."
Chain: pineapple IsA fruit, nectarine IsA fruit, pineapple HasProperty prickly
Polished: A pineapple is just a peach that grew itself some armor.
Raw: "You know what they say, a steel with no iron is just a harder than gold iron."
Chain: steel MadeOf iron, steel HasProperty hard
Polished: You know what they say, steel without the iron is just a dream of being hard.
Raw: "Funny how the bamboo never has enough grow very quickly for itself."
Chain: bamboo CapableOf grow_quickly
Polished: DISCARD
Raw: "That's just funning the canoe and praying for boiling food."
Chain: canoe UsedFor transport, fire UsedFor boiling_food
Polished: DISCARD"""
def llm_chat_completion(messages, max_retries=3):
"""Chat completion with retry logic."""
import requests
for attempt in range(max_retries):
try:
resp = requests.post(LLM_ENDPOINT, json={
"model": LLM_MODEL,
"messages": messages,
}, timeout=120)
resp.raise_for_status()
data = resp.json()
return data["choices"][0]["message"]["content"].strip()
except Exception as e:
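            # Exponential backoff: wait 2**attempt seconds (1s, 2s, ...) before retrying.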
wait = (2 ** attempt)
print(f" LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(wait)
else:
return None
def format_chain(chain_edges):
"""Format chain_edges list into readable string for LLM context."""
if not chain_edges:
return "(no chain data)"
parts = []
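    # e.g. an edge {"start": "steel", "relation": "MadeOf", "end": "iron", "weight": 2.0}
    # renders as "steel --MadeOf--> iron (w:2.0)"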
for edge in chain_edges:
start = edge.get("start", "?")
rel = edge.get("relation", "?")
end = edge.get("end", "?")
weight = edge.get("weight", 0)
parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
return ", ".join(parts)
def format_slots(slots):
"""Format slots dict for LLM context."""
return ", ".join(f"{k}={v}" for k, v in slots.items())
def load_already_processed(output_path):
"""Load set of raw_text strings already processed (for resume)."""
processed = set()
if output_path.exists():
with open(output_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
processed.add(entry.get("raw_text", ""))
except json.JSONDecodeError:
continue
return processed
def main():
parser = argparse.ArgumentParser(description="LLM polish pipeline for folksy sayings.")
parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_raw.jsonl"),
help="Input JSONL file")
parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
help="Output JSONL file (also serves as checkpoint)")
args = parser.parse_args()
input_path = Path(args.input)
output_path = Path(args.output)
if not input_path.exists():
print(f"Error: {input_path} not found.", file=sys.stderr)
sys.exit(1)
# Load raw entries
raw_entries = []
with open(input_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
raw_entries.append(json.loads(line))
print(f"Loaded {len(raw_entries)} raw entries from {input_path}")
# Check what's already been processed
already_processed = load_already_processed(output_path)
remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]
print(f"Already processed: {len(already_processed)}")
print(f"Remaining: {len(remaining)}")
if not remaining:
print("Nothing to process.")
return
discards = 0
polished = 0
errors = 0
with open(output_path, "a", encoding="utf-8") as out:
for i, entry in enumerate(remaining):
raw_text = entry.get("raw_text", "")
meta_template = entry.get("meta_template", "")
chain = format_chain(entry.get("chain", []))
slots = format_slots(entry.get("slots", {}))
user_prompt = (
f"Meta-template: {meta_template}\n"
f"Relationship chain: {chain}\n"
f"Slot fills: {slots}\n"
f"Raw saying: {raw_text}"
)
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
]
response = llm_chat_completion(messages)
if response is None:
entry["status"] = "error"
errors += 1
elif response.strip().upper() == "DISCARD":
entry["status"] = "discarded"
discards += 1
else:
entry["polished_text"] = response.strip()
entry["status"] = "polished"
polished += 1
out.write(json.dumps(entry, ensure_ascii=False) + "\n")
if (i + 1) % 100 == 0:
out.flush()
total_done = len(already_processed) + i + 1
print(f" [{total_done}/{len(raw_entries)}] "
f"polished={polished}, discarded={discards}, errors={errors}")
time.sleep(0.1)
total_done = len(already_processed) + len(remaining)
print(f"\nDone: {total_done} total entries processed.")
print(f" Polished: {polished}")
print(f" Discarded: {discards}")
print(f" Errors: {errors}")
    if polished + discards:
        print(f"  Discard rate: {discards/(polished+discards)*100:.1f}%")
    else:
        print("  Discard rate: N/A")
print(f"Output: {output_path}")
if __name__ == "__main__":
main()