corpus generation (work from mid-February)
parent 8c8a058301
commit 356b62c6ea
16 changed files with 25872 additions and 38 deletions
213 scripts/compute_corpus_stats.py Normal file
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""Compute corpus statistics and validation metrics.

Reads corpus files and computes counts, distributions, coverage, and balance warnings.

Usage:
    python scripts/compute_corpus_stats.py
    python scripts/compute_corpus_stats.py --corpus-dir corpus/
"""

import argparse
import csv
import json
import sys
from collections import Counter
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"


def load_jsonl(path):
    """Load a JSONL file."""
    entries = []
    if not path.exists():
        return entries
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
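
# Illustrative sketch of the assumed record shape (not prescribed by this
# script, which reads whatever JSON objects the earlier pipeline stages wrote):
#     >>> load_jsonl(Path("corpus/corpus_polished.jsonl"))[0]
#     {"polished_text": "...", "meta_template": "...", "slots": {...}, "status": "polished"}
# Blank lines are skipped, and a missing file yields [] rather than raising.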


def classify_input_type(inp):
    """Classify the input framing type of a training pair."""
    if inp.startswith("Tell me something about"):
        return "word_seeded"
    elif inp.startswith("Tell me a saying about"):
        return "category_seeded"
    elif inp.startswith("What would a"):
        return "persona_seeded"
    elif inp.startswith("Give me a") and "proverb" in inp:
        return "template_seeded"
    elif any(inp.startswith(p) for p in [
        "Tell me some folk", "What do they", "Give me a proverb",
        "Share some", "What's a good"
    ]):
        # Note: the "Give me a proverb" prefix here can never match, because
        # the template_seeded branch above already catches any "Give me a..."
        # input that contains "proverb".
        return "open_ended"
    else:
        return "fictional"


def main():
    parser = argparse.ArgumentParser(description="Compute corpus statistics.")
    parser.add_argument("--corpus-dir", default=str(PROJECT_DIR / "corpus"),
                        help="Corpus directory")
    parser.add_argument("--output", default=None,
                        help="Output JSON file (default: corpus_dir/corpus_stats.json)")
    args = parser.parse_args()

    corpus_dir = Path(args.corpus_dir)
    output_path = Path(args.output) if args.output else corpus_dir / "corpus_stats.json"

    # Load all corpus files
    raw = load_jsonl(corpus_dir / "corpus_raw.jsonl")
    polished = load_jsonl(corpus_dir / "corpus_polished.jsonl")
    filtered = load_jsonl(corpus_dir / "corpus_filtered.jsonl")
    training = load_jsonl(corpus_dir / "training_pairs.jsonl")

    # Load vocab for coverage analysis
    vocab_words = set()
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                vocab_words.add(row["word"])

    stats = {}

    # --- Raw corpus stats ---
    stats["raw_count"] = len(raw)
    raw_by_template = Counter(e.get("meta_template", "unknown") for e in raw)
    stats["raw_by_template"] = dict(sorted(raw_by_template.items()))

    # --- Polish stats ---
    polished_entries = [e for e in polished if e.get("status") == "polished"]
    discarded_entries = [e for e in polished if e.get("status") == "discarded"]
    error_entries = [e for e in polished if e.get("status") == "error"]

    stats["polished_count"] = len(polished_entries)
    stats["discarded_during_polish"] = len(discarded_entries)
    stats["errors_during_polish"] = len(error_entries)
    if polished_entries or discarded_entries:
        total_processed = len(polished_entries) + len(discarded_entries)
        stats["polish_discard_rate"] = f"{len(discarded_entries)/total_processed*100:.1f}%"

    polish_by_template = Counter(e.get("meta_template", "unknown") for e in polished_entries)
    stats["polished_by_template"] = dict(sorted(polish_by_template.items()))

    discard_by_template = Counter(e.get("meta_template", "unknown") for e in discarded_entries)
    stats["discarded_by_template"] = dict(sorted(discard_by_template.items()))

    # --- Filter stats ---
    stats["filtered_count"] = len(filtered)

    filter_by_template = Counter(e.get("meta_template", "unknown") for e in filtered)
    stats["filtered_by_template"] = dict(sorted(filter_by_template.items()))

    # Filter discard count
    stats["discarded_during_filter"] = len(polished_entries) - len(filtered)

    # --- Training pairs stats ---
    stats["training_pair_count"] = len(training)

    training_by_template = Counter(e.get("meta_template", "unknown") for e in training)
    stats["training_by_template"] = dict(sorted(training_by_template.items()))

    input_type_counts = Counter(classify_input_type(e.get("input", "")) for e in training)
    stats["training_by_input_type"] = dict(sorted(input_type_counts.items()))

    # --- Coverage analysis ---
    used_words = set()
    for entry in filtered:
        slots = entry.get("slots", {})
        for v in slots.values():
            word = v.lower().replace(" ", "_")
            if word in vocab_words:
                used_words.add(word)

    stats["unique_slot_words_used"] = len(used_words)
    stats["total_vocab_words"] = len(vocab_words)
    stats["vocab_coverage"] = f"{len(used_words)/len(vocab_words)*100:.1f}%" if vocab_words else "N/A"

    never_used = sorted(vocab_words - used_words)
    stats["words_never_used"] = never_used
    stats["words_never_used_count"] = len(never_used)

    # --- Saying length stats ---
    lengths = []
    for entry in filtered:
        text = entry.get("polished_text", "")
        if text:
            lengths.append(len(text.split()))

    if lengths:
        stats["avg_saying_length_words"] = round(sum(lengths) / len(lengths), 1)
        stats["min_saying_length_words"] = min(lengths)
        stats["max_saying_length_words"] = max(lengths)

    # --- Balance warnings ---
    warnings = []
    if filtered:
        total_filtered = len(filtered)
        for template, count in filter_by_template.items():
            pct = count / total_filtered * 100
            if pct < 10:
                warnings.append(
                    f"WARNING: {template} has only {count} entries ({pct:.1f}%) — "
                    f"below 10% threshold. Generate more raw sayings for this family."
                )

    if training:
        total_training = len(training)
        for template, count in training_by_template.items():
            pct = count / total_training * 100
            if pct < 5:
                warnings.append(
                    f"WARNING: {template} has only {count} training pairs ({pct:.1f}%) — very underrepresented."
                )

    stats["balance_warnings"] = warnings

    # --- Write output ---
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    # --- Print summary ---
    print("=" * 60)
    print("CORPUS STATISTICS")
    print("=" * 60)

    print(f"\nRaw sayings: {stats['raw_count']}")
    print(f"Polished sayings: {stats['polished_count']}")
    print(f"Discarded (polish): {stats.get('discarded_during_polish', 0)} ({stats.get('polish_discard_rate', 'N/A')})")
    print(f"Discarded (filter): {stats.get('discarded_during_filter', 0)}")
    print(f"Final filtered: {stats['filtered_count']}")
    print(f"Training pairs: {stats['training_pair_count']}")

    print("\nDistribution by meta-template (filtered):")
    for t, c in sorted(filter_by_template.items()):
        pct = c / len(filtered) * 100 if filtered else 0
        print(f"  {t:30s} {c:5d} ({pct:5.1f}%)")

    print("\nDistribution by input framing type:")
    for t, c in sorted(input_type_counts.items()):
        print(f"  {t:20s} {c:5d}")

    print(f"\nVocab coverage: {stats['vocab_coverage']} ({stats['unique_slot_words_used']}/{stats['total_vocab_words']})")
    print(f"Average saying length: {stats.get('avg_saying_length_words', 'N/A')} words")

    if warnings:
        print("\nBalance warnings:")
        for w in warnings:
            print(f"  {w}")

    print(f"\nFull stats: {output_path}")


if __name__ == "__main__":
    main()
787 scripts/enhance_graph.py Normal file
@@ -0,0 +1,787 @@
#!/usr/bin/env python3
"""LLM-augmented graph enhancement for the folksy subgraph.

Three phases:
  Phase 1: Per-word relationship expansion
  Phase 2: Cross-word bridge discovery
  Phase 3: Property enrichment for false_equivalence templates

Usage:
    python scripts/enhance_graph.py --phase 1            # Run phase 1 only
    python scripts/enhance_graph.py --phase 2            # Run phase 2 only
    python scripts/enhance_graph.py --phase 3            # Run phase 3 only
    python scripts/enhance_graph.py --all                # Run all phases
    python scripts/enhance_graph.py --phase 1 --dry-run  # Print prompts without calling LLM
"""

import argparse
import csv
import os
import random
import re
import sys
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

# Paths
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

VALID_RELATIONS = {
    "AtLocation", "MadeOf", "PartOf", "UsedFor", "HasA", "HasProperty",
    "Causes", "HasPrerequisite", "CapableOf", "ReceivesAction", "Desires",
    "CausesDesire", "LocatedNear", "CreatedBy", "MotivatedByGoal", "HasSubevent",
}

AUGMENTED_CSV = DATA_DIR / "folksy_relations_augmented.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"
LOG_CSV = DATA_DIR / "enhancement_log.csv"

# ---------------------------------------------------------------------------
# Infrastructure
# ---------------------------------------------------------------------------

def llm_chat_completion(messages, max_retries=3):
    """Chat completion with retry logic."""
    import requests

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
            }, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            wait = 2 ** attempt
            print(f"  LLM call failed (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                print(f"  Retrying in {wait}s...", file=sys.stderr)
                time.sleep(wait)
            else:
                print("  Giving up on this word.", file=sys.stderr)
                return None
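
# Minimal usage sketch (assumes the LLM_ENDPOINT above is reachable; the
# return value is the assistant message string, or None once retries are
# exhausted):
#     reply = llm_chat_completion([
#         {"role": "system", "content": "You are a commonsense annotator."},
#         {"role": "user", "content": "Word: horse"},
#     ])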


def load_vocab():
    """Load folksy vocabulary."""
    vocab = {}
    with open(DATA_DIR / "folksy_vocab.csv", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            word = row["word"]
            cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
            vocab[word] = {
                "categories": cats,
                "tangibility": float(row.get("tangibility_score", 0)),
                "edge_count": int(row.get("conceptnet_edge_count", 0)),
            }
    return vocab


def load_relations():
    """Load existing relations (ConceptNet + any existing augmented)."""
    edges = defaultdict(list)  # (start, relation) -> [(end, weight, surface)]
    existing_triples = set()   # (start, end, relation) for dedup

    for path in [DATA_DIR / "folksy_relations.csv", AUGMENTED_CSV]:
        if not path.exists():
            continue
        with open(path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                sw = row["start_word"]
                ew = row["end_word"]
                rel = row["relation"]
                if not row["weight"]:
                    # Skip rows with a missing weight (corrupt or partial lines).
                    continue
                w = float(row["weight"])
                surf = row.get("surface_text", "")
                edges[(sw, rel)].append((ew, w, surf))
                existing_triples.add((sw, ew, rel))

    return edges, existing_triples


def load_checkpoint():
    """Load enhancement log to determine what's already been processed."""
    processed = set()  # (word, phase)
    if LOG_CSV.exists():
        with open(LOG_CSV, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                processed.add((row["source_word"], row["phase"]))
    return processed


def append_log(word, phase, edges_generated, edges_accepted, edges_duplicate, edges_oov):
    """Append a row to the enhancement log."""
    write_header = not LOG_CSV.exists()
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["source_word", "phase", "timestamp",
                             "edges_generated", "edges_accepted", "edges_duplicate", "edges_oov"])
        writer.writerow([word, phase, datetime.now().isoformat(),
                         edges_generated, edges_accepted, edges_duplicate, edges_oov])


def append_augmented_edges(edges):
    """Append edges to the augmented relations CSV."""
    write_header = not AUGMENTED_CSV.exists()
    with open(AUGMENTED_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text", "source"])
        for e in edges:
            writer.writerow([e["start_word"], e["end_word"], e["relation"],
                             e["weight"], e["surface_text"], e["source"]])


def append_candidates(candidates):
    """Append candidate words to the candidate additions CSV."""
    write_header = not CANDIDATE_CSV.exists()
    with open(CANDIDATE_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["word", "suggested_by", "relation_context", "frequency"])
        for c in candidates:
            writer.writerow([c["word"], c["suggested_by"], c["relation_context"], c["frequency"]])


# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------

def parse_llm_relations(response_text, source_word):
    """Parse structured LLM output into edge dicts.

    Handles bullets, numbering, extra whitespace, multi-word targets.
    """
    edges = []
    if not response_text:
        return edges

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        # Strip leading bullets/numbers: "- ", "1. ", "* ", etc.
        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = line.strip()

        if not line or "NONE" in line.upper():
            continue

        # Match: RELATION_TYPE: target_word(s) | surface text
        match = re.match(r"^(\w+):\s*(.+?)\s*\|\s*(.+)$", line)
        if not match:
            continue

        relation, target_raw, surface = match.groups()
        relation = relation.strip()

        if relation not in VALID_RELATIONS:
            continue

        # Normalize target: lowercase, replace spaces with underscores for multi-word
        target = target_raw.strip().lower()
        target = re.sub(r"\s+", "_", target)

        # Skip self-loops
        if target == source_word:
            continue

        edges.append({
            "start_word": source_word,
            "end_word": target,
            "relation": relation,
            "weight": 0.8,
            "surface_text": surface.strip(),
            "source": "llm_augmented",
        })

    return edges
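
# Illustrative parse, using the example line from PHASE1_SYSTEM below:
#     >>> parse_llm_relations("AtLocation: barn | you find a horse in a barn", "horse")
#     [{'start_word': 'horse', 'end_word': 'barn', 'relation': 'AtLocation',
#       'weight': 0.8, 'surface_text': 'you find a horse in a barn',
#       'source': 'llm_augmented'}]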


def parse_bridge_response(response_text, word_a, word_b):
    """Parse bridge discovery LLM output."""
    edges = []
    if not response_text:
        return edges

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        # Strip common prefixes
        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = re.sub(r"^BRIDGE:\s*", "", line, flags=re.IGNORECASE)
        line = line.strip()

        if not line:
            continue

        # BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation
        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 3:
            continue

        bridge_word = parts[0].strip().lower().replace(" ", "_")

        # Parse relation_to_first
        rel1_match = re.search(r"(?:relation_to_first|first):\s*(\w+)", parts[1], re.IGNORECASE)
        rel2_match = re.search(r"(?:relation_to_second|second):\s*(\w+)", parts[2], re.IGNORECASE)

        if not rel1_match or not rel2_match:
            # Try simpler format: just the relation type
            rel1_match = re.match(r"(\w+)", parts[1].split(":")[-1].strip())
            rel2_match = re.match(r"(\w+)", parts[2].split(":")[-1].strip())

        if not rel1_match or not rel2_match:
            continue

        rel1 = rel1_match.group(1)
        rel2 = rel2_match.group(1)

        if rel1 not in VALID_RELATIONS or rel2 not in VALID_RELATIONS:
            continue

        explanation = parts[3].strip() if len(parts) > 3 else ""

        # Create edges: word_a -> bridge and bridge -> word_b
        edges.append({
            "start_word": word_a,
            "end_word": bridge_word,
            "relation": rel1,
            "weight": 0.8,
            "surface_text": explanation,
            "source": "llm_bridge",
        })
        edges.append({
            "start_word": bridge_word,
            "end_word": word_b,
            "relation": rel2,
            "weight": 0.8,
            "surface_text": explanation,
            "source": "llm_bridge",
        })

    return edges
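
# Illustrative parse, using the example from PHASE2_SYSTEM below: for
# word_a="cow", word_b="butter", the line
#     "milk | relation_to_first: CapableOf | relation_to_second: MadeOf | milk connects production to product"
# yields two edges, cow -CapableOf-> milk and milk -MadeOf-> butter, both with
# weight 0.8 and source "llm_bridge".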


def parse_property_response(response_text, word):
    """Parse property enrichment LLM output."""
    edges = []
    if not response_text:
        return edges

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = line.strip()

        if not line:
            continue

        # PROPERTY | explanation
        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 1:
            continue

        prop = parts[0].strip().lower().replace(" ", "_")
        explanation = parts[1].strip() if len(parts) > 1 else f"{word} is {prop}"

        if not prop or prop == word:
            continue

        edges.append({
            "start_word": word,
            "end_word": prop,
            "relation": "HasProperty",
            "weight": 0.8,
            "surface_text": explanation,
            "source": "llm_property",
        })

    return edges


# ---------------------------------------------------------------------------
# Phase 1: Per-Word Expansion
# ---------------------------------------------------------------------------

PHASE1_SYSTEM = """You are a commonsense knowledge annotator. You will be given a concrete noun and its known relationships. Your job is to generate ADDITIONAL commonsense relationships that are missing.

Rules:
- Only generate relationships involving concrete, tangible things (animals, foods, tools, plants, buildings, weather, landscape, household objects)
- Every relationship must be something a typical adult would agree is true
- Do not repeat any relationship already listed as "known"
- Target words should be common English words (top 3000 frequency preferred)
- Output ONLY the structured format shown below, one relationship per line
- If you cannot think of good relationships for a given type, output NONE for that type
- Aim for 3-5 relationships per type where possible

Output format (one per line):
RELATION_TYPE: target_word | short natural phrasing

Example output:
AtLocation: barn | you find a horse in a barn
UsedFor: riding | a horse is used for riding
HasA: mane | a horse has a mane
CapableOf: gallop | a horse can gallop
MadeOf: NONE
PartOf: herd | a horse is part of a herd"""


PHASE1_USER = """Word: {word}
Categories: {categories}

Known relationships:
{existing_edges}

Generate additional relationships for these types:
- AtLocation (where is it found?)
- UsedFor (what is it used for?)
- HasA (what does it have / contain?)
- PartOf (what is it part of?)
- CapableOf (what can it do?)
- MadeOf (what is it made of?)
- HasPrerequisite (what do you need before you can have/use it?)
- Causes (what does it cause or lead to?)
- HasProperty (what adjectives describe it? — limit to physical/sensory properties)"""


def format_existing_edges(edges_dict, word):
    """Format existing edges for a word grouped by relation type."""
    relation_types = ["AtLocation", "UsedFor", "HasA", "PartOf", "CapableOf",
                      "MadeOf", "HasPrerequisite", "Causes", "HasProperty"]

    lines = []
    for rel in relation_types:
        targets = edges_dict.get((word, rel), [])
        if targets:
            formatted = ", ".join(f"{t[0]} (weight {t[1]:.1f})" for t in targets[:10])
            lines.append(f"{rel}: {formatted}")
        else:
            lines.append(f"{rel}: (none in database)")
    return "\n".join(lines)


def run_phase1(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 1: Per-word relationship expansion."""
    words = sorted(vocab.keys())
    total = len(words)
    total_accepted = 0
    total_skipped = 0

    print(f"Phase 1: Processing {total} words...")

    for i, word in enumerate(words):
        if (word, "1") in checkpoint:
            total_skipped += 1
            continue

        categories = ", ".join(vocab[word]["categories"])
        existing = format_existing_edges(edges, word)

        user_prompt = PHASE1_USER.format(
            word=word, categories=categories, existing_edges=existing
        )

        messages = [
            {"role": "system", "content": PHASE1_SYSTEM},
            {"role": "user", "content": user_prompt},
        ]

        if dry_run:
            if i < 3:  # Show first 3 prompts
                print(f"\n--- Prompt for '{word}' ---")
                print(f"System: {PHASE1_SYSTEM[:200]}...")
                print(f"User:\n{user_prompt}")
            elif i == 3:
                print(f"\n... ({total - 3} more words) ...")
            continue

        response = llm_chat_completion(messages)
        parsed = parse_llm_relations(response, word) if response else []

        # Classify edges
        accepted = []
        candidates = []
        duplicates = 0

        for edge in parsed:
            triple = (edge["start_word"], edge["end_word"], edge["relation"])
            if triple in existing_triples:
                duplicates += 1
                continue

            existing_triples.add(triple)

            if edge["end_word"] in vocab:
                accepted.append(edge)
            else:
                candidates.append({
                    "word": edge["end_word"],
                    "suggested_by": word,
                    "relation_context": f"{edge['relation']}: {edge['surface_text']}",
                    "frequency": 1,
                })

        if accepted:
            append_augmented_edges(accepted)
            # Also update in-memory edges for subsequent words
            for e in accepted:
                edges[(e["start_word"], e["relation"])].append(
                    (e["end_word"], e["weight"], e["surface_text"]))

        if candidates:
            append_candidates(candidates)

        total_accepted += len(accepted)

        append_log(word, "1", len(parsed), len(accepted), duplicates, len(candidates))

        if (i + 1) % 50 == 0:
            print(f"  [{i+1}/{total}] {total_accepted} edges accepted so far")

        time.sleep(0.1)

    if dry_run:
        print(f"\nDry run complete. Would process {total - total_skipped} words.")
    else:
        print(f"\nPhase 1 complete: {total_accepted} new edges accepted.")


# ---------------------------------------------------------------------------
# Phase 2: Cross-Word Bridge Discovery
# ---------------------------------------------------------------------------

PHASE2_SYSTEM = """You are a commonsense knowledge annotator. You will be given two concrete nouns. Your job is to identify a BRIDGE word that connects them — something that relates to both.

Rules:
- The bridge word must be a common, concrete noun
- State the relationship type for each connection
- Valid relationship types: AtLocation, UsedFor, HasA, PartOf, CapableOf, MadeOf, HasPrerequisite, Causes, HasProperty, ReceivesAction, Desires, CausesDesire, LocatedNear, CreatedBy
- Output format: BRIDGE_WORD | relation_to_first: TYPE | relation_to_second: TYPE | explanation

Example:
Words: "cow" and "butter"
milk | relation_to_first: CapableOf | relation_to_second: MadeOf | milk connects production to product"""


PHASE2_USER = """Words: "{word_a}" and "{word_b}"
Categories: {word_a} is {categories_a}, {word_b} is {categories_b}
Find 1-3 bridge words that connect them."""


def build_reachability(vocab, edges):
    """Build 2-hop reachability from vocab words to other vocab words."""
    vocab_set = set(vocab.keys())
    reachable = defaultdict(set)  # word -> set of reachable vocab words

    for word in vocab:
        # Direct (1-hop) neighbors in vocab
        for (sw, rel), targets in edges.items():
            if sw == word:
                for (ew, w, s) in targets:
                    if ew in vocab_set and ew != word:
                        reachable[word].add(ew)
                    # 2-hop from this neighbor
                    for (sw2, rel2), targets2 in edges.items():
                        if sw2 == ew:
                            for (ew2, w2, s2) in targets2:
                                if ew2 in vocab_set and ew2 != word:
                                    reachable[word].add(ew2)

    return reachable
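
# Note on cost: the inner loops rescan the full edge dict for every vocab word
# and every neighbor, so this grows roughly with |vocab| * |edges| * degree.
# Fine at this scale; if the graph grows, pre-indexing edges by start word
# (one pass building {start: [(end, ...)]}) would make the 2-hop walk cheap.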


def run_phase2(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 2: Cross-word bridge discovery."""
    print("Phase 2: Building reachability matrix...")
    reachable = build_reachability(vocab, edges)

    # Find low-connectivity words
    vocab_set = set(vocab.keys())
    low_connectivity = []
    for word in vocab:
        reach_count = len(reachable.get(word, set()))
        if reach_count < 10:
            low_connectivity.append((word, reach_count))

    low_connectivity.sort(key=lambda x: x[1])
    print(f"  {len(low_connectivity)} words with <10 reachable vocab words")

    # Build category index
    by_category = defaultdict(list)
    for word, info in vocab.items():
        for cat in info["categories"]:
            by_category[cat].append(word)

    total_accepted = 0
    pairs_processed = 0
    total_skipped = 0

    for word, reach_count in low_connectivity:
        if (word, "2") in checkpoint:
            total_skipped += 1
            continue

        word_cats = vocab[word]["categories"]
        word_reachable = reachable.get(word, set())

        # Find same-category words that are unreachable
        unreachable = []
        for cat in word_cats:
            for peer in by_category.get(cat, []):
                if peer != word and peer not in word_reachable:
                    unreachable.append(peer)

        if not unreachable:
            append_log(word, "2", 0, 0, 0, 0)
            continue

        # Sample 5-10 unreachable peers
        sample = random.sample(unreachable, min(10, len(unreachable)))

        accepted_for_word = 0

        for peer in sample:
            pair_key = f"{word}:{peer}"
            # Note: append_log only ever records per-word entries, so this
            # pair-level check never matches; phase 2 resumability is
            # effectively per word, not per pair.
            if (pair_key, "2") in checkpoint:
                continue

            categories_a = ", ".join(vocab[word]["categories"])
            categories_b = ", ".join(vocab[peer]["categories"])

            user_prompt = PHASE2_USER.format(
                word_a=word, word_b=peer,
                categories_a=categories_a, categories_b=categories_b,
            )

            messages = [
                {"role": "system", "content": PHASE2_SYSTEM},
                {"role": "user", "content": user_prompt},
            ]

            if dry_run:
                if pairs_processed < 3:
                    print(f"\n--- Bridge prompt: '{word}' <-> '{peer}' ---")
                    print(f"User:\n{user_prompt}")
                elif pairs_processed == 3:
                    print("\n... (more pairs) ...")
                pairs_processed += 1
                continue

            response = llm_chat_completion(messages)
            parsed = parse_bridge_response(response, word, peer) if response else []

            accepted = []
            duplicates = 0
            oov = 0

            for edge in parsed:
                triple = (edge["start_word"], edge["end_word"], edge["relation"])
                if triple in existing_triples:
                    duplicates += 1
                    continue
                existing_triples.add(triple)

                # For bridge edges, both endpoints should ideally be in vocab
                if edge["start_word"] in vocab_set and edge["end_word"] in vocab_set:
                    accepted.append(edge)
                elif edge["start_word"] in vocab_set or edge["end_word"] in vocab_set:
                    # At least one end in vocab — still useful
                    accepted.append(edge)
                else:
                    oov += 1

            if accepted:
                append_augmented_edges(accepted)
                for e in accepted:
                    edges[(e["start_word"], e["relation"])].append(
                        (e["end_word"], e["weight"], e["surface_text"]))
                accepted_for_word += len(accepted)

            pairs_processed += 1
            time.sleep(0.1)

        total_accepted += accepted_for_word
        append_log(word, "2", 0, accepted_for_word, 0, 0)

        if pairs_processed % 20 == 0:
            print(f"  {pairs_processed} pairs processed, {total_accepted} edges accepted")

    if dry_run:
        print(f"\nDry run complete. Would process {pairs_processed} word pairs.")
    else:
        print(f"\nPhase 2 complete: {total_accepted} bridge edges accepted from {pairs_processed} pairs.")


# ---------------------------------------------------------------------------
# Phase 3: Property Enrichment
# ---------------------------------------------------------------------------

PHASE3_SYSTEM = """You are a commonsense knowledge annotator. Given a concrete noun, list its most distinctive physical or sensory properties — things you could see, touch, hear, smell, or taste. Also list behavioral properties for animals.

Rules:
- Only physical/sensory/behavioral properties, not abstract qualities
- Properties should DISTINGUISH this thing from similar things in its category
- Output one property per line as: PROPERTY | brief explanation
- Aim for 5-8 properties"""


PHASE3_USER = """Word: {word}
Category: {categories}
Other words in same category: {peers}

What properties distinguish {word} from the others listed?"""


def run_phase3(vocab, edges, existing_triples, checkpoint, dry_run=False):
    """Phase 3: Property enrichment for false_equivalence templates."""
    by_category = defaultdict(list)
    for word, info in vocab.items():
        for cat in info["categories"]:
            by_category[cat].append(word)

    words = sorted(vocab.keys())
    total = len(words)
    total_accepted = 0
    total_skipped = 0

    print(f"Phase 3: Property enrichment for {total} words...")

    for i, word in enumerate(words):
        if (word, "3") in checkpoint:
            total_skipped += 1
            continue

        word_cats = vocab[word]["categories"]
        categories = ", ".join(word_cats)

        # Gather same-category peers (sample of 10)
        peers = set()
        for cat in word_cats:
            for peer in by_category.get(cat, []):
                if peer != word:
                    peers.add(peer)
        peer_sample = random.sample(list(peers), min(10, len(peers))) if peers else []

        if not peer_sample:
            append_log(word, "3", 0, 0, 0, 0)
            continue

        user_prompt = PHASE3_USER.format(
            word=word, categories=categories,
            peers=", ".join(peer_sample),
        )

        messages = [
            {"role": "system", "content": PHASE3_SYSTEM},
            {"role": "user", "content": user_prompt},
        ]

        if dry_run:
            if i < 3:
                print(f"\n--- Property prompt for '{word}' ---")
                print(f"User:\n{user_prompt}")
            elif i == 3:
                print(f"\n... ({total - 3} more words) ...")
            continue

        response = llm_chat_completion(messages)
        parsed = parse_property_response(response, word) if response else []

        accepted = []
        duplicates = 0

        for edge in parsed:
            triple = (edge["start_word"], edge["end_word"], edge["relation"])
            if triple in existing_triples:
                duplicates += 1
                continue
            existing_triples.add(triple)
            accepted.append(edge)

        if accepted:
            append_augmented_edges(accepted)
            for e in accepted:
                edges[(e["start_word"], e["relation"])].append(
                    (e["end_word"], e["weight"], e["surface_text"]))

        total_accepted += len(accepted)
        append_log(word, "3", len(parsed), len(accepted), duplicates, 0)

        if (i + 1) % 50 == 0:
            print(f"  [{i+1}/{total}] {total_accepted} properties accepted so far")

        time.sleep(0.1)

    if dry_run:
        print(f"\nDry run complete. Would process {total - total_skipped} words.")
    else:
        print(f"\nPhase 3 complete: {total_accepted} new HasProperty edges accepted.")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(
        description="LLM-augmented graph enhancement for folksy subgraph."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--phase", type=int, choices=[1, 2, 3],
                       help="Run a specific phase (1, 2, or 3)")
    group.add_argument("--all", action="store_true",
                       help="Run all three phases in sequence")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print prompts without calling LLM")

    args = parser.parse_args()

    vocab = load_vocab()
    edges, existing_triples = load_relations()
    checkpoint = load_checkpoint()

    print(f"Loaded {len(vocab)} vocab words, {len(existing_triples)} existing edge triples.")
    print(f"Checkpoint: {len(checkpoint)} (word, phase) pairs already processed.")

    phases = [args.phase] if args.phase else [1, 2, 3]

    for phase in phases:
        print(f"\n{'='*60}")
        print(f"Running Phase {phase}")
        print(f"{'='*60}")

        if phase == 1:
            run_phase1(vocab, edges, existing_triples, checkpoint, args.dry_run)
        elif phase == 2:
            run_phase2(vocab, edges, existing_triples, checkpoint, args.dry_run)
        elif phase == 3:
            run_phase3(vocab, edges, existing_triples, checkpoint, args.dry_run)

        # Reload checkpoint after each phase for resumability
        checkpoint = load_checkpoint()

    print("\nDone.")


if __name__ == "__main__":
    main()
512 scripts/expand_vocab.py Normal file
@@ -0,0 +1,512 @@
#!/usr/bin/env python3
"""Expand folksy vocabulary with high-quality candidates from LLM suggestions.

Reads candidate_additions.csv (words suggested by the LLM during phase 1 that
weren't in the vocab), filters for quality, uses the LLM to assign categories,
and appends the survivors to folksy_vocab.csv.

After running this, re-run `enhance_graph.py --phase 1` to generate edges
for the new words (the checkpoint will skip already-processed words).

Usage:
    python scripts/expand_vocab.py                    # Full run
    python scripts/expand_vocab.py --dry-run          # Show what would be added
    python scripts/expand_vocab.py --min-citations 8  # Stricter threshold
"""

import argparse
import csv
import json
import re
import shutil
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

VOCAB_CSV = DATA_DIR / "folksy_vocab.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"

# Valid categories from the existing vocabulary
VALID_CATEGORIES = {
    "animal", "beverage", "bird", "building", "clothing", "container", "crop",
    "fabric", "fish", "flower", "food", "fruit", "furniture", "grain", "herb",
    "insect", "instrument", "landscape", "material", "metal", "mineral",
    "organism", "plant", "rock", "seed", "shelter", "spice", "stone",
    "structure", "tool", "tree", "vegetable", "vehicle", "water", "weapon", "wood",
}

# ---------------------------------------------------------------------------
# Exclusion lists
# ---------------------------------------------------------------------------

# Abstract concepts, emotions, processes — not concrete enough for folksy vocab
EXCLUDE_ABSTRACT = {
    "ecosystem", "satisfaction", "fullness", "warmth", "fear", "relaxation",
    "growth", "interest", "nature", "protection", "digestion", "injury",
    "decoration", "construction", "landscape", "noise", "sound", "energy",
    "nourishment", "nutrition", "pollination", "sustainability", "tradition",
    "biodiversity", "symbolism", "elegance", "resilience", "patience",
    "beauty", "abundance", "fertility", "creativity", "harmony", "comfort",
    "curiosity", "companionship", "loyalty", "aggression", "alertness",
    "camouflage", "predation", "migration", "hibernation", "decomposition",
    "erosion", "combustion", "fermentation", "oxidation", "corrosion",
    "photosynthesis", "respiration", "evaporation", "precipitation",
    "transpiration", "germination", "excitement", "enjoyment", "satiety",
    "stability", "organization", "fragrance", "moisture", "wildlife",
    "preservation", "conversation", "inspiration", "storage", "observation",
    "hydration", "destruction", "entertainment", "education", "knowledge",
    "safety", "practice", "research", "skill", "space", "license",
    "collection", "habitat", "pollution", "health", "vibration", "wonder",
    "awe", "refreshment", "irritation", "happiness", "joy", "damage",
    "death", "pain", "thirst", "alarm", "contents", "ingredients",
    "electricity", "oxygen", "navigation", "recreation", "meditation",
    "celebration", "communication", "imagination", "devotion",
    "ambition", "endurance", "independence", "discipline", "cooperation",
    "sweetness", "aroma", "flavor", "texture",
    "smell", "color", "surface", "bottom", "edge",
    "nutrients", "study", "outfit", "upholstery",
}

# Scientific/technical — not folksy enough for folk wisdom
EXCLUDE_TECHNICAL = {
    "cellulose", "exoskeleton", "protein", "tissue", "cells", "alloy",
    "enzyme", "chlorophyll", "genome", "photon",
    "organism", "molecule", "compound", "polymer", "isotope",
    "ecosystem", "metabolism", "catalyst", "membrane", "chromosome",
    "cell", "nutrient", "ingredient", "material", "content",
}

# Collective/institutional nouns — not concrete individual things
EXCLUDE_INSTITUTIONAL = {
    "orchestra", "fleet", "arsenal", "toolkit", "collection",
    "restaurant", "museum", "university", "corporation", "organization",
    "musician", "breakfast", "dinner", "meal", "dish", "sandwich",
    "seafood", "refrigerator", "garage", "basement", "park",
}

# Adjectives and properties — useful as HasProperty targets but not as vocab words
EXCLUDE_ADJECTIVES = {
    "small", "large", "heavy", "colorful", "green", "brown", "hard",
    "white", "round", "sharp", "sturdy", "long", "soft", "flat",
    "sweet", "bitter", "smooth", "rough", "bright", "dark", "dry",
    "wet", "thick", "thin", "warm", "cold", "hot", "tall", "short",
    "red", "blue", "yellow", "black", "grey", "gray", "pink",
    "fragrant", "loud", "spicy", "sour", "tough", "delicate", "strong",
    "weak", "light", "dense", "portable", "lightweight", "transparent",
    "opaque", "flexible", "rigid", "brittle", "elastic", "porous",
    "compact", "edible", "toxic", "aromatic", "nocturnal", "aquatic",
    "durable", "cylindrical", "wooden", "shiny", "solid", "narrow",
    "metallic", "pungent", "juicy", "fast", "powerful", "woody",
    "fibrous", "savory", "liquid", "enclosed", "rectangular", "wild",
    "feathered", "leafy", "crunchy", "dangerous", "fuzzy", "slimy",
    "natural", "waterproof", "electronic",
}

# Words that are clearly verbs or gerunds
EXCLUDE_VERBS = {
    "eating", "cooking", "growing", "fishing", "hunting", "flying",
    "mining", "flavoring", "singing", "blooming", "holding", "baking",
    "ripening", "opening", "cutting", "protecting", "seasoning",
    "storing", "building", "swimming", "brewing", "weaving", "carving",
    "climbing", "digging", "plowing", "sewing", "spinning", "tanning",
    "swim", "run", "grow", "eat", "hunt", "peck", "bite", "dive",
    "crawl", "cut", "shine", "sparkle",
}


def singularize(word):
    """Best-effort singularization. Returns (singular, was_plural)."""
    # Irregular plurals
    irregulars = {
        "teeth": "tooth", "feet": "foot", "geese": "goose", "mice": "mouse",
        "lice": "louse", "dice": "die", "oxen": "ox", "children": "child",
        "leaves": "leaf", "loaves": "loaf", "halves": "half", "knives": "knife",
        "lives": "life", "wives": "wife", "wolves": "wolf", "shelves": "shelf",
        "calves": "calf",
    }
    if word in irregulars:
        return irregulars[word], True

    # -ves -> -f (already covered some above, catch remaining)
    if word.endswith("ves"):
        candidate = word[:-3] + "f"
        return candidate, True

    # -ies -> -y
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y", True

    # -ses, -xes, -zes, -ches, -shes -> drop -es
    if word.endswith(("ses", "xes", "zes", "ches", "shes")):
        return word[:-2], True

    # -s (but not -ss, -us, -is)
    if word.endswith("s") and not word.endswith(("ss", "us", "is")):
        return word[:-1], True

    return word, False
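
# Illustrative behavior of the heuristics above:
#     >>> singularize("geese")
#     ('goose', True)        # irregular table
#     >>> singularize("berries")
#     ('berry', True)        # -ies -> -y
#     >>> singularize("boxes")
#     ('box', True)          # -xes -> drop -es
#     >>> singularize("grass")
#     ('grass', False)       # -ss endings are left alone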


def is_plural_of_existing(word, existing_vocab):
    """Check if word is likely a plural form of an existing vocab word."""
    # word + s
    if word.endswith("s") and word[:-1] in existing_vocab:
        return True
    # word + es
    if word.endswith("es") and word[:-2] in existing_vocab:
        return True
    # word ending ies -> y
    if word.endswith("ies") and word[:-3] + "y" in existing_vocab:
        return True
    # word ending ves -> f/fe
    if word.endswith("ves"):
        if word[:-3] + "f" in existing_vocab:
            return True
        if word[:-3] + "fe" in existing_vocab:
            return True
    return False
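
# Illustrative: with "wolf" in the vocab, is_plural_of_existing("wolves", vocab)
# is True via the -ves -> -f rule, so "wolves" would be rejected as a duplicate.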


def is_plural_of_candidate(word, accepted_words):
    """Check if word is a plural of another candidate, or vice versa."""
    # Is this word a plural of something accepted?
    if word.endswith("s") and word[:-1] in accepted_words:
        return True
    if word.endswith("es") and word[:-2] in accepted_words:
        return True
    if word.endswith("ies") and word[:-3] + "y" in accepted_words:
        return True
    # Is something accepted a plural of this word?
    if word + "s" in accepted_words:
        return True
    if word + "es" in accepted_words:
        return True
    if word.endswith("f") and word[:-1] + "ves" in accepted_words:
        return True
    if word.endswith("fe") and word[:-2] + "ves" in accepted_words:
        return True
    return False


# ---------------------------------------------------------------------------
# LLM categorization
# ---------------------------------------------------------------------------

CATEGORIZE_SYSTEM = """You are a vocabulary categorizer. Given a list of concrete nouns, assign each one to one or more categories from this fixed list:

animal, beverage, bird, building, clothing, container, crop, fabric, fish, flower, food, fruit, furniture, grain, herb, insect, instrument, landscape, material, metal, mineral, organism, plant, rock, seed, shelter, spice, stone, structure, tool, tree, vegetable, vehicle, water, weapon, wood

Rules:
- Use ONLY categories from the list above
- A word can have multiple categories (e.g., "brick" -> material, stone)
- If a word fits none of the categories well, output SKIP
- Output format: word: category1, category2
- One word per line"""

CATEGORIZE_USER = """Categorize these words:
{word_list}"""


def llm_chat_completion(messages, max_retries=3):
    """Chat completion with retry logic."""
    import requests

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
            }, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            wait = 2 ** attempt
            print(f"  LLM call failed (attempt {attempt+1}/{max_retries}): {e}",
                  file=sys.stderr)
            if attempt < max_retries - 1:
                print(f"  Retrying in {wait}s...", file=sys.stderr)
                time.sleep(wait)
            else:
                print("  Giving up on this batch.", file=sys.stderr)
                return None


def parse_categories(response_text, valid_words):
    """Parse LLM categorization response."""
    result = {}
    if not response_text:
        return result

    for line in response_text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        # Strip bullets/numbers
        line = re.sub(r"^[\d]+[.)]\s*", "", line)
        line = re.sub(r"^[-*•]\s*", "", line)
        line = line.strip()

        # Match: word: cat1, cat2
        match = re.match(r"^(\w+)\s*:\s*(.+)$", line)
        if not match:
            continue

        word = match.group(1).strip().lower()
        cats_raw = match.group(2).strip()

        if "SKIP" in cats_raw.upper():
            continue

        cats = []
        for c in cats_raw.split(","):
            c = c.strip().lower()
            if c in VALID_CATEGORIES:
                cats.append(c)

        if word in valid_words and cats:
            result[word] = cats

    return result
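
# Illustrative parse (assumed LLM response line, per CATEGORIZE_SYSTEM above):
#     >>> parse_categories("brick: material, stone", {"brick"})
#     {'brick': ['material', 'stone']}
# Lines marked SKIP, unknown categories, and words outside valid_words are dropped.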


def categorize_words(words, batch_size=25):
    """Categorize words using the LLM in batches."""
    all_categories = {}
    word_set = set(words)

    for i in range(0, len(words), batch_size):
        batch = words[i:i + batch_size]
        word_list = "\n".join(f"- {w}" for w in batch)

        messages = [
            {"role": "system", "content": CATEGORIZE_SYSTEM},
            {"role": "user", "content": CATEGORIZE_USER.format(word_list=word_list)},
        ]

        response = llm_chat_completion(messages)
        parsed = parse_categories(response, word_set)
        all_categories.update(parsed)

        categorized = len(parsed)
        print(f"  Batch {i // batch_size + 1}: {categorized}/{len(batch)} categorized")
        time.sleep(0.1)

    return all_categories


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(
        description="Expand folksy vocabulary with LLM-suggested candidates."
    )
    parser.add_argument("--min-citations", type=int, default=5,
                        help="Minimum number of vocab words that suggested this candidate (default: 5)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be added without modifying files")
    parser.add_argument("--no-llm", action="store_true",
                        help="Skip LLM categorization (use placeholder categories)")

    args = parser.parse_args()

    # Load existing vocab
    existing_vocab = {}
    with open(VOCAB_CSV, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            existing_vocab[row["word"]] = row
    existing_words = set(existing_vocab.keys())
    print(f"Existing vocabulary: {len(existing_words)} words")

    # Load candidates
    candidates = []
    with open(CANDIDATE_CSV, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            candidates.append(row)

    # Aggregate: count unique sources per candidate word
    word_sources = defaultdict(set)
    for c in candidates:
        word_sources[c["word"]].add(c["suggested_by"])

    print(f"Total candidate rows: {len(candidates)}")
    print(f"Unique candidate words: {len(word_sources)}")

    # Normalize plurals: merge citation counts into singular forms
    normalized_sources = defaultdict(set)
    for word, sources in word_sources.items():
        singular, was_plural = singularize(word)
        # Merge into the singular form
        normalized_sources[singular].update(sources)
    # Replace word_sources with the normalized version
    word_sources = dict(normalized_sources)
    print(f"After singularization: {len(word_sources)} unique candidates")

    # Filter
    accepted = []
    reject_reasons = Counter()

    # Sort by citation count descending for consistent ordering
    sorted_candidates = sorted(word_sources.items(), key=lambda x: len(x[1]), reverse=True)
    accepted_set = set()

    for word, sources in sorted_candidates:
        citation_count = len(sources)

        # Minimum citation threshold
        if citation_count < args.min_citations:
            reject_reasons["below_threshold"] += 1
            continue

        # No multi-word (underscore) candidates
        if "_" in word:
            reject_reasons["multi_word"] += 1
            continue

        # Already in vocab
        if word in existing_words:
            reject_reasons["already_in_vocab"] += 1
            continue

        # Exclude abstracts
        if word in EXCLUDE_ABSTRACT:
            reject_reasons["abstract"] += 1
            continue

        # Exclude adjectives
        if word in EXCLUDE_ADJECTIVES:
            reject_reasons["adjective"] += 1
            continue

        # Exclude verbs/gerunds
        if word in EXCLUDE_VERBS:
            reject_reasons["verb_gerund"] += 1
            continue

        # Exclude technical/scientific
        if word in EXCLUDE_TECHNICAL:
            reject_reasons["technical"] += 1
            continue

        # Exclude institutional/collective
        if word in EXCLUDE_INSTITUTIONAL:
            reject_reasons["institutional"] += 1
            continue

        # Gerund pattern catch-all (but allow exceptions)
        if word.endswith("ing") and word not in {"ring", "spring", "string", "wing", "ceiling"}:
            reject_reasons["gerund_pattern"] += 1
            continue

        # Exclude plurals of existing vocab
        if is_plural_of_existing(word, existing_words):
            reject_reasons["plural_of_existing"] += 1
            continue

        # Exclude plurals of already-accepted candidates
        if is_plural_of_candidate(word, accepted_set):
            reject_reasons["plural_of_candidate"] += 1
            continue

        # Single character
        if len(word) < 2:
            reject_reasons["too_short"] += 1
            continue

        accepted.append((word, citation_count))
        accepted_set.add(word)

    print("\nFiltering results:")
    print(f"  Accepted: {len(accepted)}")
    for reason, count in reject_reasons.most_common():
        print(f"  Rejected ({reason}): {count}")

    if not accepted:
        print("\nNo candidates passed filtering.")
        return

    # Show accepted words
    print(f"\nAccepted candidates ({len(accepted)}):")
    for word, count in accepted:
        print(f"  {word:25s} cited by {count:3d} vocab words")

    if args.dry_run:
        print(f"\nDry run complete. Would add {len(accepted)} words to vocabulary.")
        return

    # Categorize with LLM
    words_to_categorize = [w for w, _ in accepted]

    if args.no_llm:
        print("\nSkipping LLM categorization (--no-llm). Using 'material' as placeholder.")
        categories = {w: ["material"] for w in words_to_categorize}
    else:
        print(f"\nCategorizing {len(words_to_categorize)} words with LLM...")
        categories = categorize_words(words_to_categorize)

    # Words the LLM couldn't categorize get skipped
    uncategorized = [w for w in words_to_categorize if w not in categories]
    if uncategorized:
        print(f"\n  {len(uncategorized)} words could not be categorized (skipped):")
        for w in uncategorized:
            print(f"    {w}")

    # Build new vocab entries
    new_entries = []
    for word, citation_count in accepted:
        if word not in categories:
            continue
        cats = categories[word]
        new_entries.append({
            "word": word,
            "categories": ",".join(cats),
            "tangibility_score": "0.80",
            "conceptnet_edge_count": "0",
            "frequency_rank": "0",
        })

    if not new_entries:
        print("\nNo entries to add after categorization.")
        return

    # Backup existing vocab
    backup_path = VOCAB_CSV.with_suffix(f".csv.bak.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    shutil.copy2(VOCAB_CSV, backup_path)
    print(f"\nBacked up vocabulary to {backup_path.name}")

    # Append to vocab CSV
    with open(VOCAB_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["word", "categories", "tangibility_score",
                                               "conceptnet_edge_count", "frequency_rank"])
        for entry in new_entries:
            writer.writerow(entry)

    print(f"\nAdded {len(new_entries)} words to {VOCAB_CSV.name}")
    print(f"New vocabulary size: {len(existing_words) + len(new_entries)}")

    # Summary by category
    cat_counts = Counter()
    for entry in new_entries:
        for c in entry["categories"].split(","):
            cat_counts[c.strip()] += 1
    print("\nNew words by category:")
    for cat, count in cat_counts.most_common():
        print(f"  {cat:20s} {count:3d}")

    print("\nNext step: run 'python scripts/enhance_graph.py --phase 1' to generate edges for new words.")


if __name__ == "__main__":
    main()
|
||||
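For reference, the plural checks called above can be as simple as suffix heuristics. A minimal sketch, assuming the real is_plural_of_existing / is_plural_of_candidate helpers (defined earlier in this script) work along these lines; the helper name below is hypothetical:

def _looks_like_plural_of(word, known_words):
    """Hypothetical sketch of the plural heuristics assumed above."""
    if word.endswith("ies") and word[:-3] + "y" in known_words:  # berries -> berry
        return True
    if word.endswith("es") and word[:-2] in known_words:  # boxes -> box
        return True
    if word.endswith("s") and word[:-1] in known_words:  # ropes -> rope
        return True
    return False
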
177
scripts/filter_corpus.py
Normal file
@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""Quality filtering for polished folksy sayings.

Reads corpus_polished.jsonl, applies quality filters, and outputs the filtered
corpus plus a discard analysis.

Usage:
    python scripts/filter_corpus.py
    python scripts/filter_corpus.py --input corpus/corpus_polished.jsonl --output corpus/corpus_filtered.jsonl
"""

import argparse
import csv
import json
import sys
from collections import Counter
from difflib import SequenceMatcher
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"


def quality_filter(entry):
    """Apply quality filters to a polished entry.

    Returns (passed, reason) tuple.
    """
    text = entry.get("polished_text", "")
    if not text:
        return False, "no_polished_text"

    words = text.split()

    # Length check
    if len(words) > 25:
        return False, "too_long"
    if len(words) < 5:
        return False, "too_short"

    # Must contain at least 2 of the original slot-fill nouns
    slot_words = set(entry.get("slots", {}).values())
    words_present = sum(1 for w in slot_words if w.lower() in text.lower())
    if words_present < 2:
        return False, "lost_key_nouns"
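    # Note: this is a substring check, so short slot words can also match
    # inside longer words (e.g. "ox" inside "box"); a loose test that errs
    # toward keeping entries.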

    # No raw ConceptNet artifacts (multi-word underscore phrases)
    if "_" in text:
        return False, "conceptnet_artifact"

    # No broken templates (unfilled slots)
    if "{" in text or "}" in text:
        return False, "unfilled_slot"

    return True, "pass"


def is_near_duplicate(text_a, text_b, threshold=0.75):
    """Check if two texts are near-duplicates."""
    return SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio() > threshold

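# Illustrative ratios: near-identical sayings score high, e.g.
#   SequenceMatcher(None, "a stitch in time saves nine",
#                   "a stitch in time saves ten").ratio()  # roughly 0.9 -> duplicate
# while structurally different sayings land well under the 0.75 default.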


def deduplicate_within_family(entries):
    """Remove near-duplicates within each meta-template family.

    Returns (kept, removed) lists.
    """
    by_family = {}
    for entry in entries:
        family = entry.get("meta_template", "unknown")
        by_family.setdefault(family, []).append(entry)

    kept = []
    removed = []

    for family, family_entries in by_family.items():
        family_kept = []
        for entry in family_entries:
            text = entry.get("polished_text", "")
            is_dup = False
            for existing in family_kept:
                if is_near_duplicate(text, existing.get("polished_text", "")):
                    is_dup = True
                    break
            if is_dup:
                removed.append((entry, "near_duplicate"))
            else:
                family_kept.append(entry)
        kept.extend(family_kept)

    return kept, removed

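# Design note: deduplication is greedy (the first entry kept wins) and
# pairwise within each family, so the cost is roughly O(n^2) per template
# family; presumably acceptable at this corpus scale.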


def main():
    parser = argparse.ArgumentParser(description="Quality filtering for polished folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Input polished JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
                        help="Output filtered JSONL file")
    parser.add_argument("--discard-analysis", default=str(CORPUS_DIR / "discard_analysis.csv"),
                        help="Discard analysis CSV file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    discard_path = Path(args.discard_analysis)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load polished entries (only those with status=polished)
    all_entries = []
    already_discarded = 0
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            if entry.get("status") == "polished":
                all_entries.append(entry)
            elif entry.get("status") == "discarded":
                already_discarded += 1

    print(f"Loaded {len(all_entries)} polished entries ({already_discarded} already discarded by LLM)")

    # Apply quality filters
    passed = []
    discards = []  # (entry, reason)

    for entry in all_entries:
        ok, reason = quality_filter(entry)
        if ok:
            passed.append(entry)
        else:
            discards.append((entry, reason))

    print(f"Quality filter: {len(passed)} passed, {len(discards)} discarded")

    # Show discard breakdown
    reason_counts = Counter(r for _, r in discards)
    for reason, count in reason_counts.most_common():
        print(f"  {reason}: {count}")

    # Near-duplicate detection within template families
    kept, dup_removed = deduplicate_within_family(passed)
    discards.extend(dup_removed)

    print(f"Near-duplicate removal: {len(dup_removed)} removed, {len(kept)} remaining")

    # Write filtered output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in kept:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"\nFiltered corpus: {len(kept)} entries -> {output_path}")

    # Write discard analysis
    with open(discard_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["raw_text", "meta_template", "discard_stage", "discard_reason"])
        for entry, reason in discards:
            writer.writerow([
                entry.get("raw_text", ""),
                entry.get("meta_template", ""),
                "llm_polish" if reason == "no_polished_text" else "quality_filter",
                reason,
            ])

    print(f"Discard analysis: {len(discards)} entries -> {discard_path}")


if __name__ == "__main__":
    main()

385
scripts/format_training_pairs.py
Normal file
@ -0,0 +1,385 @@
#!/usr/bin/env python3
"""Format filtered sayings into training pairs for fine-tuning.

Each polished saying generates 3-5 training pairs with different input framings.
Also generates fictional entity training pairs.

Usage:
    python scripts/format_training_pairs.py
    python scripts/format_training_pairs.py --input corpus/corpus_filtered.jsonl --output corpus/training_pairs.jsonl
"""

import argparse
import csv
import json
import random
import sys
from collections import Counter
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
DATA_DIR = PROJECT_DIR / "data"
EXAMPLES_DIR = PROJECT_DIR / "examples"

# Template name mappings for human-readable prompts
TEMPLATE_NAMES = {
    "deconstruction": "deconstruction",
    "denial_of_consequences": "denial of consequences",
    "ironic_deficiency": "ironic deficiency",
    "futile_preparation": "futile preparation",
    "hypocritical_complaint": "hypocritical complaint",
    "tautological_wisdom": "tautological wisdom",
    "false_equivalence": "false equivalence",
}

PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]

OPEN_ENDED_PROMPTS = [
    "Tell me some folk wisdom.",
    "What do they say?",
    "Give me a proverb.",
    "Share some old-time wisdom.",
    "What's a good saying?",
]

# Auto-generated fictional entities for additional training pairs
AUTO_ENTITIES = [
    {
        "name": "Stoneclaw",
        "categories": ["animal", "predator"],
        "properties": ["fierce", "rocky", "nocturnal"],
        "relations": {"AtLocation": ["cave", "mountain"], "HasA": ["claws", "scales"], "CapableOf": ["hunting", "climbing"]},
    },
    {
        "name": "Duskmelon",
        "categories": ["fruit", "food"],
        "properties": ["purple", "sweet", "fragrant"],
        "relations": {"AtLocation": ["garden", "market"], "UsedFor": ["eating", "jam"], "MadeOf": ["seed", "juice"]},
    },
    {
        "name": "Windloom",
        "categories": ["tool", "craft"],
        "properties": ["wooden", "portable", "intricate"],
        "relations": {"UsedFor": ["weaving", "thread"], "MadeOf": ["wood", "string"], "AtLocation": ["workshop", "cottage"]},
    },
    {
        "name": "Briarvine",
        "categories": ["plant", "herb"],
        "properties": ["thorny", "green", "medicinal"],
        "relations": {"AtLocation": ["forest", "hedge"], "UsedFor": ["healing", "tea"], "HasA": ["thorn", "leaf"]},
    },
    {
        "name": "Mudhog",
        "categories": ["animal", "livestock"],
        "properties": ["muddy", "stubborn", "heavy"],
        "relations": {"AtLocation": ["farm", "swamp"], "Desires": ["food", "mud"], "CapableOf": ["digging", "rooting"]},
    },
    {
        "name": "Frostberry",
        "categories": ["fruit", "food"],
        "properties": ["cold", "blue", "tiny"],
        "relations": {"AtLocation": ["mountain", "tundra"], "UsedFor": ["eating", "preserves"], "HasProperty": ["cold", "tart"]},
    },
    {
        "name": "Lanternmoss",
        "categories": ["plant", "fungus"],
        "properties": ["glowing", "damp", "soft"],
        "relations": {"AtLocation": ["cave", "swamp"], "UsedFor": ["light", "decoration"], "HasProperty": ["luminous", "fragile"]},
    },
    {
        "name": "Cinderhawk",
        "categories": ["bird", "animal"],
        "properties": ["fiery", "fast", "red"],
        "relations": {"AtLocation": ["mountain", "volcano"], "CapableOf": ["flying", "hunting"], "HasA": ["talons", "feathers"]},
    },
    {
        "name": "Rootstone",
        "categories": ["stone", "material"],
        "properties": ["veined", "hard", "ancient"],
        "relations": {"AtLocation": ["quarry", "riverbed"], "UsedFor": ["building", "carving"], "MadeOf": ["mineral", "root"]},
    },
    {
        "name": "Silkwort",
        "categories": ["plant", "fiber"],
        "properties": ["silky", "white", "tall"],
        "relations": {"AtLocation": ["field", "meadow"], "UsedFor": ["weaving", "cloth"], "HasA": ["stem", "fiber"]},
    },
    {
        "name": "Kettlefrog",
        "categories": ["animal", "amphibian"],
        "properties": ["loud", "round", "green"],
        "relations": {"AtLocation": ["pond", "marsh"], "CapableOf": ["jumping", "croaking"], "Desires": ["flies", "water"]},
    },
    {
        "name": "Dustwheat",
        "categories": ["crop", "grain"],
        "properties": ["dry", "golden", "hardy"],
        "relations": {"AtLocation": ["field", "barn"], "UsedFor": ["bread", "flour"], "HasPrerequisite": ["rain", "soil"]},
    },
]


def format_entity_description(entity):
    """Format entity into a natural description string."""
    name = entity["name"]
    cats = entity.get("categories", [])
    props = entity.get("properties", [])
    rels = entity.get("relations", {})

    parts = []

    # Category description
    if props and cats:
        prop_str = ", ".join(props[:3])
        cat_str = " and ".join(cats[:2])
        parts.append(f"A {name} is a {prop_str} {cat_str}.")
    elif cats:
        parts.append(f"A {name} is a {' and '.join(cats[:2])}.")

    # Location
    if "AtLocation" in rels:
        locs = rels["AtLocation"]
        parts.append(f"It is found near {' and '.join(locs[:2])}.")

    # Parts/properties
    if "HasA" in rels:
        has = rels["HasA"]
        parts.append(f"It has {', '.join(has[:3])}.")

    # Capabilities
    if "CapableOf" in rels:
        caps = rels["CapableOf"]
        parts.append(f"It can {' and '.join(caps[:2])}.")

    # Uses
    if "UsedFor" in rels:
        uses = rels["UsedFor"]
        parts.append(f"It is used for {' and '.join(uses[:2])}.")

    return " ".join(parts)

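# For example, the Stoneclaw entry above renders as:
#   "A Stoneclaw is a fierce, rocky, nocturnal animal and predator.
#    It is found near cave and mountain. It has claws, scales.
#    It can hunting and climbing."
# (The stilted "It can hunting" comes from CapableOf targets being
# stored as gerunds, ConceptNet-style.)
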
def load_vocab_categories():
    """Load vocab to get word -> categories mapping."""
    word_cats = {}
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if vocab_path.exists():
        with open(vocab_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                word = row["word"]
                cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
                word_cats[word] = cats
    return word_cats


def generate_training_pairs(entry, word_cats):
    """Generate 3-5 training pairs for a single polished saying."""
    polished = entry.get("polished_text", "")
    slots = entry.get("slots", {})
    meta_template = entry.get("meta_template", "")

    # Collect source words (concrete nouns from slots)
    source_words = [v for v in slots.values()
                    if v and not v.startswith("a ") and not v.startswith("an ") and len(v) > 1]

    # Determine categories of slot words
    slot_categories = set()
    for word in source_words:
        word_lower = word.lower().replace(" ", "_")
        if word_lower in word_cats:
            slot_categories.update(word_cats[word_lower])

    pairs = []
    base = {
        "output": polished,
        "meta_template": meta_template,
        "source_words": source_words,
    }

    # 1. Word-seeded (always include)
    if source_words:
        word = random.choice(source_words)
        pairs.append({**base, "input": f"Tell me something about {word}."})

    # 2. Category-seeded (always include if we have categories)
    if slot_categories:
        cat = random.choice(list(slot_categories))
        pairs.append({**base, "input": f"Tell me a saying about {cat}."})

    # 3. Persona-seeded (include when we have source words)
    persona = random.choice(PERSONAS)
    if source_words:
        word = random.choice(source_words)
        pairs.append({**base, "input": f"What would a {persona} say about {word}?"})

    # 4. Template-seeded (include ~70% of the time)
    if random.random() < 0.7:
        template_name = TEMPLATE_NAMES.get(meta_template, meta_template)
        pairs.append({**base, "input": f"Give me a {template_name} proverb."})

    # 5. Open-ended (include ~30% of the time)
    if random.random() < 0.3:
        prompt = random.choice(OPEN_ENDED_PROMPTS)
        pairs.append({**base, "input": prompt})

    return pairs

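# A typical pair emitted above (illustrative values, not from a real corpus):
#   {"input": "What would a farmer say about rope?",
#    "output": "<the polished saying>",
#    "meta_template": "futile_preparation",
#    "source_words": ["rope", "bucket"]}
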
def generate_fictional_pairs(entities):
    """Generate training pairs for fictional entities.

    These pairs include the entity description in the input.
    """
    pairs = []

    # Generate 15-25 pairs per entity
    for entity in entities:
        name = entity["name"]
        desc = format_entity_description(entity)
        props = entity.get("properties", [])
        rels = entity.get("relations", {})

        # Collect words related to this entity
        related_words = []
        for targets in rels.values():
            related_words.extend(targets)

        n_pairs = random.randint(15, 25)

        for _ in range(n_pairs):
            framing = random.choice(["persona", "word", "category", "open"])

            if framing == "persona":
                persona = random.choice(PERSONAS)
                input_text = f"{desc} What would a {persona} say about a {name}?"
            elif framing == "word" and related_words:
                word = random.choice(related_words)
                input_text = f"{desc} Tell me a saying about {name} and {word}."
            elif framing == "category":
                cats = entity.get("categories", ["thing"])
                cat = random.choice(cats)
                input_text = f"{desc} Give me folk wisdom about this {cat}."
            else:
                input_text = f"{desc} Tell me some folk wisdom about {name}."

            # Placeholder output — these would ideally be generated through the
            # template engine with fictional entities loaded, then polished.
            # For now, generate a structural placeholder that indicates the
            # entity relationships.
            pairs.append({
                "input": input_text,
                "output": "",  # Will be filled by actual generation
                "meta_template": "fictional",
                "source_words": [name] + related_words[:3],
                "_needs_generation": True,
                "_entity": entity,
            })

    return pairs


def main():
    parser = argparse.ArgumentParser(description="Format training pairs for fine-tuning.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
                        help="Input filtered JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"),
                        help="Output training pairs JSONL file")
    parser.add_argument("--entities", default=str(EXAMPLES_DIR / "my_world.json"),
                        help="Fictional entities JSON file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    entities_path = Path(args.entities)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load vocab categories
    word_cats = load_vocab_categories()

    # Load filtered entries
    entries = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))

    print(f"Loaded {len(entries)} filtered entries")

    # Generate training pairs for each entry
    all_pairs = []
    for entry in entries:
        pairs = generate_training_pairs(entry, word_cats)
        all_pairs.extend(pairs)

    print(f"Generated {len(all_pairs)} training pairs from polished sayings")

    # Generate fictional entity pairs
    fictional_entities = []
    if entities_path.exists():
        with open(entities_path, encoding="utf-8") as f:
            data = json.load(f)
        fictional_entities = data.get("entities", [])
        print(f"Loaded {len(fictional_entities)} fictional entities from {entities_path}")

    # Add auto-generated entities
    fictional_entities.extend(AUTO_ENTITIES)
    print(f"Total fictional entities (file + auto-generated): {len(fictional_entities)}")

    fictional_pairs = generate_fictional_pairs(fictional_entities)

    # Filter out placeholder pairs (those that still need generation).
    # In a full pipeline, these would be generated through the template engine.
    # For now, skip any with empty output.
    real_fictional = [p for p in fictional_pairs if p.get("output")]
    placeholder_fictional = [p for p in fictional_pairs if not p.get("output")]

    if placeholder_fictional:
        print(f"  {len(placeholder_fictional)} fictional pairs need generation via template engine")
        print("  (Run folksy_generator.py with --entities to generate these, then re-run this script)")

    all_pairs.extend(real_fictional)

    # Clean up internal fields before writing
    for pair in all_pairs:
        pair.pop("_needs_generation", None)
        pair.pop("_entity", None)

    # Write output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    # Stats
    input_types = Counter()
    for pair in all_pairs:
        inp = pair["input"]
        if inp.startswith("Tell me something about"):
            input_types["word_seeded"] += 1
        elif inp.startswith("Tell me a saying about"):
            input_types["category_seeded"] += 1
        elif inp.startswith("What would a"):
            input_types["persona_seeded"] += 1
        # Check exact open-ended prompts before the broader "Give me a ..."
        # template check, so "Give me a proverb." is not misclassified as
        # template-seeded.
        elif inp in OPEN_ENDED_PROMPTS:
            input_types["open_ended"] += 1
        elif inp.startswith("Give me a") and "proverb" in inp:
            input_types["template_seeded"] += 1
        else:
            input_types["fictional"] += 1

    print(f"\nTotal training pairs: {len(all_pairs)}")
    print("Distribution by input type:")
    for itype, count in sorted(input_types.items()):
        print(f"  {itype:20s} {count:5d}")

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()

61
scripts/generate_raw_batch.sh
Executable file
@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Generate raw folksy sayings across all 7 templates.
# Output: corpus/corpus_raw.jsonl (~10,500 entries)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
CORPUS_DIR="$PROJECT_DIR/corpus"
GENERATOR="$PROJECT_DIR/folksy_generator.py"

COUNT_PER_TEMPLATE=${1:-1500}

mkdir -p "$CORPUS_DIR"

OUTPUT="$CORPUS_DIR/corpus_raw.jsonl"
# Clear existing file
> "$OUTPUT"
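# Note: truncating here means a re-run rebuilds the raw corpus from scratch;
# polish_corpus.py resumes by raw_text, so only sayings that reappear
# verbatim are skipped downstream.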

TEMPLATES=(
  deconstruction
  denial_of_consequences
  ironic_deficiency
  futile_preparation
  hypocritical_complaint
  tautological_wisdom
  false_equivalence
)

echo "Generating $COUNT_PER_TEMPLATE sayings per template (${#TEMPLATES[@]} templates)..."
echo "Output: $OUTPUT"

total=0
for template in "${TEMPLATES[@]}"; do
  echo -n "  $template ($COUNT_PER_TEMPLATE)... "
  before=$(wc -l < "$OUTPUT")
  python "$GENERATOR" --template "$template" --count "$COUNT_PER_TEMPLATE" --json >> "$OUTPUT" 2>/dev/null
  after=$(wc -l < "$OUTPUT")
  generated=$((after - before))
  total=$((total + generated))
  echo "$generated generated"
done

echo ""
echo "Total: $total raw sayings in $OUTPUT"
echo ""

# Check template distribution
echo "Template distribution:"
python -c "
import json, sys
from collections import Counter
counts = Counter()
with open('$OUTPUT') as f:
    for line in f:
        entry = json.loads(line)
        counts[entry['meta_template']] += 1
for template, count in sorted(counts.items()):
    print(f'  {template:30s} {count:5d}')
print(f\"  {'TOTAL':30s} {sum(counts.values()):5d}\")
"
215
scripts/polish_corpus.py
Normal file
@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""LLM polish pipeline for raw folksy sayings.

Reads corpus_raw.jsonl and sends each saying to GLM4-32B for polishing.
The output file doubles as the checkpoint: it is opened in append mode,
and already-processed entries are skipped on resume.

Usage:
    python scripts/polish_corpus.py
    python scripts/polish_corpus.py --input corpus/corpus_raw.jsonl --output corpus/corpus_polished.jsonl
"""

import argparse
import json
import sys
import time
from pathlib import Path

import requests

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"


SYSTEM_PROMPT = """You are an editor specializing in folk sayings and rural proverbs. You will receive a rough draft of a fake folksy saying along with the relationship chain it encodes.

Your job:
1. Fix grammar, articles, and pluralization
2. Make it sound natural — like something a weathered farmer would say while leaning on a fence post
3. Preserve the core nouns and the relationship between them — do not swap out the key words
4. You MAY add small colorful details (adjectives, folksy verb choices, regional flavor) but keep it concise — real proverbs are short
5. You MAY lightly restructure the sentence for better rhythm, but keep the same meaning pattern
6. If the saying is unsalvageable nonsense (the nouns don't relate in any meaningful way, or the combination is unintentionally offensive), respond with exactly: DISCARD

Output ONLY the polished saying on a single line. No quotes, no explanation, no preamble.

Examples of good polish:

Raw: "Don't build the coffee and act surprised when the water show up."
Chain: coffee MadeOf water
Polished: Don't brew the coffee and act surprised when the water's all gone.

Raw: "The chest's children always goes without hold books."
Chain: chest UsedFor hold_books
Polished: The bookshelf-maker's kids always end up reading off the floor.

Raw: "A pineapple is just a nectarine that's got an attitude."
Chain: pineapple IsA fruit, nectarine IsA fruit, pineapple HasProperty prickly
Polished: A pineapple is just a peach that grew itself some armor.

Raw: "You know what they say, a steel with no iron is just a harder than gold iron."
Chain: steel MadeOf iron, steel HasProperty hard
Polished: You know what they say — steel without the iron is just a dream of being hard.

Raw: "Funny how the bamboo never has enough grow very quickly for itself."
Chain: bamboo CapableOf grow_quickly
Polished: DISCARD

Raw: "That's just funning the canoe and praying for boiling food."
Chain: canoe UsedFor transport, fire UsedFor boiling_food
Polished: DISCARD"""


def llm_chat_completion(messages, max_retries=3):
    """Chat completion with retry logic (exponential backoff: 1s, 2s, ...)."""
    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
            }, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"].strip()
        except Exception as e:
            wait = 2 ** attempt
            print(f"  LLM error (attempt {attempt+1}/{max_retries}): {e}", file=sys.stderr)
            if attempt < max_retries - 1:
                time.sleep(wait)
            else:
                return None


def format_chain(chain_edges):
    """Format chain_edges list into readable string for LLM context."""
    if not chain_edges:
        return "(no chain data)"
    parts = []
    for edge in chain_edges:
        start = edge.get("start", "?")
        rel = edge.get("relation", "?")
        end = edge.get("end", "?")
        weight = edge.get("weight", 0)
        parts.append(f"{start} --{rel}--> {end} (w:{weight:.1f})")
    return ", ".join(parts)

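# Illustrative rendering: a single-edge chain such as coffee MadeOf water
# (weight 1.0 assumed) becomes "coffee --MadeOf--> water (w:1.0)".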


def format_slots(slots):
    """Format slots dict for LLM context."""
    return ", ".join(f"{k}={v}" for k, v in slots.items())


def load_already_processed(output_path):
    """Load set of raw_text strings already processed (for resume)."""
    processed = set()
    if output_path.exists():
        with open(output_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    processed.add(entry.get("raw_text", ""))
                except json.JSONDecodeError:
                    continue
    return processed

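# Resume semantics: raw_text is the checkpoint key, which assumes raw
# sayings are unique in corpus_raw.jsonl; duplicate raw texts would be
# polished only once.
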
def main():
    parser = argparse.ArgumentParser(description="LLM polish pipeline for folksy sayings.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_raw.jsonl"),
                        help="Input JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "corpus_polished.jsonl"),
                        help="Output JSONL file (also serves as checkpoint)")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load raw entries
    raw_entries = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                raw_entries.append(json.loads(line))

    print(f"Loaded {len(raw_entries)} raw entries from {input_path}")

    # Check what's already been processed
    already_processed = load_already_processed(output_path)
    remaining = [e for e in raw_entries if e.get("raw_text", "") not in already_processed]

    print(f"Already processed: {len(already_processed)}")
    print(f"Remaining: {len(remaining)}")

    if not remaining:
        print("Nothing to process.")
        return

    discards = 0
    polished = 0
    errors = 0

    with open(output_path, "a", encoding="utf-8") as out:
        for i, entry in enumerate(remaining):
            raw_text = entry.get("raw_text", "")
            meta_template = entry.get("meta_template", "")
            chain = format_chain(entry.get("chain", []))
            slots = format_slots(entry.get("slots", {}))

            user_prompt = (
                f"Meta-template: {meta_template}\n"
                f"Relationship chain: {chain}\n"
                f"Slot fills: {slots}\n"
                f"Raw saying: {raw_text}"
            )

            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ]

            response = llm_chat_completion(messages)

            if response is None:
                entry["status"] = "error"
                errors += 1
            elif response.strip().upper() == "DISCARD":
                entry["status"] = "discarded"
                discards += 1
            else:
                entry["polished_text"] = response.strip()
                entry["status"] = "polished"
                polished += 1

            out.write(json.dumps(entry, ensure_ascii=False) + "\n")

            if (i + 1) % 100 == 0:
                out.flush()
                total_done = len(already_processed) + i + 1
                print(f"  [{total_done}/{len(raw_entries)}] "
                      f"polished={polished}, discarded={discards}, errors={errors}")

            time.sleep(0.1)

    total_done = len(already_processed) + len(remaining)
    print(f"\nDone: {total_done} total entries processed.")
    print(f"  Polished: {polished}")
    print(f"  Discarded: {discards}")
    print(f"  Errors: {errors}")
    if polished + discards:
        print(f"  Discard rate: {discards / (polished + discards) * 100:.1f}%")
    else:
        print("  Discard rate: N/A")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()