corpus generation (work from mid february)

2026-03-09 19:52:09 -04:00 · 2026-03-09 19:52:09 -04:00 · 356b62c6ea
commit 356b62c6ea
parent 8c8a058301
16 changed files with 25872 additions and 38 deletions
--- a/scripts/format_training_pairs.py
+++ b/scripts/format_training_pairs.py
@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+"""Format filtered sayings into training pairs for fine-tuning.
+
+Each polished saying generates 3-5 training pairs with different input framings.
+Also generates fictional entity training pairs.
+
+Usage:
+  python scripts/format_training_pairs.py
+  python scripts/format_training_pairs.py --input corpus/corpus_filtered.jsonl --output corpus/training_pairs.jsonl
+"""
+
+import argparse
+import csv
+import json
+import random
+import sys
+from pathlib import Path
+
+# Filesystem layout, resolved relative to this script's own location so the
+# default paths do not depend on the current working directory.
+SCRIPT_DIR = Path(__file__).parent
+PROJECT_DIR = SCRIPT_DIR.parent  # parent of the directory that holds this script
+CORPUS_DIR = PROJECT_DIR / "corpus"  # default home of the --input / --output JSONL files
+DATA_DIR = PROJECT_DIR / "data"  # where load_vocab_categories() looks for folksy_vocab.csv
+EXAMPLES_DIR = PROJECT_DIR / "examples"  # default home of the --entities JSON file
+
+# Template name mappings for human-readable prompts
+# Keys are the `meta_template` identifiers found on corpus entries; values are
+# the wording spliced into "Give me a ... proverb." prompts. Identifiers that
+# are missing here are used verbatim by generate_training_pairs().
+TEMPLATE_NAMES = {
+    "deconstruction": "deconstruction",
+    "denial_of_consequences": "denial of consequences",
+    "ironic_deficiency": "ironic deficiency",
+    "futile_preparation": "futile preparation",
+    "hypocritical_complaint": "hypocritical complaint",
+    "tautological_wisdom": "tautological wisdom",
+    "false_equivalence": "false equivalence",
+}
+
+# Speaker personas; one is picked at random for each persona-framed prompt.
+PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]
+
+# Prompts that name no word, category, persona or template. One is picked at
+# random when generate_training_pairs() decides to add an open-ended pair.
+OPEN_ENDED_PROMPTS = [
+    "Tell me some folk wisdom.",
+    "What do they say?",
+    "Give me a proverb.",
+    "Share some old-time wisdom.",
+    "What's a good saying?",
+]
+
+# Auto-generated fictional entities for additional training pairs
+#
+# Each entity is a dict with:
+#   name        display name used in prompts
+#   categories  kinds of thing the entity is (the first two are rendered)
+#   properties  adjectives describing it (the first three are rendered)
+#   relations   relation name -> list of related words
+#
+# format_entity_description() only renders the AtLocation, HasA, CapableOf and
+# UsedFor relations. The other relations present here (MadeOf, Desires,
+# HasProperty, HasPrerequisite) still feed the related-word pool built in
+# generate_fictional_pairs().
+# NOTE(review): relation names look ConceptNet-style; confirm against the
+# template engine's expectations.
+AUTO_ENTITIES = [
+    {
+        "name": "Stoneclaw",
+        "categories": ["animal", "predator"],
+        "properties": ["fierce", "rocky", "nocturnal"],
+        "relations": {"AtLocation": ["cave", "mountain"], "HasA": ["claws", "scales"], "CapableOf": ["hunting", "climbing"]},
+    },
+    {
+        "name": "Duskmelon",
+        "categories": ["fruit", "food"],
+        "properties": ["purple", "sweet", "fragrant"],
+        "relations": {"AtLocation": ["garden", "market"], "UsedFor": ["eating", "jam"], "MadeOf": ["seed", "juice"]},
+    },
+    {
+        "name": "Windloom",
+        "categories": ["tool", "craft"],
+        "properties": ["wooden", "portable", "intricate"],
+        "relations": {"UsedFor": ["weaving", "thread"], "MadeOf": ["wood", "string"], "AtLocation": ["workshop", "cottage"]},
+    },
+    {
+        "name": "Briarvine",
+        "categories": ["plant", "herb"],
+        "properties": ["thorny", "green", "medicinal"],
+        "relations": {"AtLocation": ["forest", "hedge"], "UsedFor": ["healing", "tea"], "HasA": ["thorn", "leaf"]},
+    },
+    {
+        "name": "Mudhog",
+        "categories": ["animal", "livestock"],
+        "properties": ["muddy", "stubborn", "heavy"],
+        "relations": {"AtLocation": ["farm", "swamp"], "Desires": ["food", "mud"], "CapableOf": ["digging", "rooting"]},
+    },
+    {
+        "name": "Frostberry",
+        "categories": ["fruit", "food"],
+        "properties": ["cold", "blue", "tiny"],
+        "relations": {"AtLocation": ["mountain", "tundra"], "UsedFor": ["eating", "preserves"], "HasProperty": ["cold", "tart"]},
+    },
+    {
+        "name": "Lanternmoss",
+        "categories": ["plant", "fungus"],
+        "properties": ["glowing", "damp", "soft"],
+        "relations": {"AtLocation": ["cave", "swamp"], "UsedFor": ["light", "decoration"], "HasProperty": ["luminous", "fragile"]},
+    },
+    {
+        "name": "Cinderhawk",
+        "categories": ["bird", "animal"],
+        "properties": ["fiery", "fast", "red"],
+        "relations": {"AtLocation": ["mountain", "volcano"], "CapableOf": ["flying", "hunting"], "HasA": ["talons", "feathers"]},
+    },
+    {
+        "name": "Rootstone",
+        "categories": ["stone", "material"],
+        "properties": ["veined", "hard", "ancient"],
+        "relations": {"AtLocation": ["quarry", "riverbed"], "UsedFor": ["building", "carving"], "MadeOf": ["mineral", "root"]},
+    },
+    {
+        "name": "Silkwort",
+        "categories": ["plant", "fiber"],
+        "properties": ["silky", "white", "tall"],
+        "relations": {"AtLocation": ["field", "meadow"], "UsedFor": ["weaving", "cloth"], "HasA": ["stem", "fiber"]},
+    },
+    {
+        "name": "Kettlefrog",
+        "categories": ["animal", "amphibian"],
+        "properties": ["loud", "round", "green"],
+        "relations": {"AtLocation": ["pond", "marsh"], "CapableOf": ["jumping", "croaking"], "Desires": ["flies", "water"]},
+    },
+    {
+        "name": "Dustwheat",
+        "categories": ["crop", "grain"],
+        "properties": ["dry", "golden", "hardy"],
+        "relations": {"AtLocation": ["field", "barn"], "UsedFor": ["bread", "flour"], "HasPrerequisite": ["rain", "soil"]},
+    },
+]
+
+
+def format_entity_description(entity):
+    """Format entity into a natural description string.
+
+    Args:
+        entity: Dict with a required "name" and optional "categories",
+            "properties" and "relations" (relation name -> list of words).
+            Missing, null or empty optional fields are simply skipped.
+
+    Returns:
+        Space-joined sentences: an "is a" sentence (only when the entity has
+        at least one category), followed by one sentence for each non-empty
+        AtLocation, HasA, CapableOf and UsedFor relation, in that order.
+        Returns "" when nothing can be said.
+
+    Raises:
+        KeyError: if the entity has no "name".
+    """
+
+    def article_for(phrase):
+        # Spelling-based heuristic: "an" before a leading vowel letter.
+        # It does not model pronunciation ("hour", "unicorn").
+        return "an" if phrase[:1].lower() in ("a", "e", "i", "o", "u") else "a"
+
+    name = entity["name"]
+    # `or` also covers keys that are present but null/empty in the JSON.
+    cats = entity.get("categories") or []
+    props = entity.get("properties") or []
+    rels = entity.get("relations") or {}
+
+    parts = []
+
+    # Category description; properties are only mentioned alongside a category.
+    if cats:
+        predicate = " and ".join(cats[:2])
+        if props:
+            predicate = f"{', '.join(props[:3])} {predicate}"
+        parts.append(
+            f"{article_for(name).capitalize()} {name} is "
+            f"{article_for(predicate)} {predicate}."
+        )
+
+    # (relation, sentence pattern, joiner, max targets) in output order.
+    relation_sentences = (
+        ("AtLocation", "It is found near {}.", " and ", 2),
+        ("HasA", "It has {}.", ", ", 3),
+        ("CapableOf", "It can {}.", " and ", 2),
+        ("UsedFor", "It is used for {}.", " and ", 2),
+    )
+    for relation, pattern, joiner, limit in relation_sentences:
+        targets = rels.get(relation)
+        # An empty list would otherwise produce "It is found near ."
+        if targets:
+            parts.append(pattern.format(joiner.join(targets[:limit])))
+
+    return " ".join(parts)
+
+
+def load_vocab_categories():
+    """Build the word -> category-list lookup from the vocabulary CSV.
+
+    Reads DATA_DIR / "folksy_vocab.csv", using its "word" and "categories"
+    columns; the latter is a comma-separated list whose blank items are
+    dropped. Returns an empty dict when the file does not exist.
+    """
+    vocab_csv = DATA_DIR / "folksy_vocab.csv"
+    if not vocab_csv.exists():
+        return {}
+
+    lookup = {}
+    with open(vocab_csv, newline="", encoding="utf-8") as handle:
+        for record in csv.DictReader(handle):
+            term = record["word"]
+            labels = []
+            for piece in record["categories"].split(","):
+                label = piece.strip()
+                if label:
+                    labels.append(label)
+            lookup[term] = labels
+    return lookup
+
+
+def generate_training_pairs(entry, word_cats):
+    """Generate up to five training pairs for a single polished saying.
+
+    Every pair shares the same output (the polished saying) and differs only
+    in how the input prompt is framed.
+
+    Args:
+        entry: Corpus record; reads "polished_text", "slots" (slot name ->
+            filler word) and "meta_template".
+        word_cats: Mapping of vocabulary word -> list of categories, looked up
+            with the slot word lower-cased and spaces replaced by underscores.
+
+    Returns:
+        A list of dicts with "input", "output", "meta_template" and
+        "source_words". The list is empty when the entry has no polished
+        text, and may hold fewer than three pairs when the entry has no
+        usable slot words or categories.
+    """
+    polished = entry.get("polished_text", "")
+    # A pair with nothing to learn is worthless; main() applies the same rule
+    # to fictional pairs whose output is still empty.
+    if not polished or not polished.strip():
+        return []
+
+    slots = entry.get("slots") or {}
+    meta_template = entry.get("meta_template", "")
+
+    def article_for(phrase):
+        # Spelling-based heuristic: "an" before a leading vowel letter.
+        return "an" if phrase[:1].lower() in ("a", "e", "i", "o", "u") else "a"
+
+    # Collect source words (concrete nouns from slots). Fillers that already
+    # carry an article, single characters and non-string values are skipped.
+    source_words = [
+        v for v in slots.values()
+        if isinstance(v, str) and len(v) > 1 and not v.startswith(("a ", "an "))
+    ]
+
+    # Determine categories of slot words
+    slot_categories = set()
+    for word in source_words:
+        slot_categories.update(word_cats.get(word.lower().replace(" ", "_"), ()))
+
+    pairs = []
+    base = {
+        "output": polished,
+        "meta_template": meta_template,
+        "source_words": source_words,
+    }
+
+    # 1. Word-seeded (whenever there is a source word)
+    if source_words:
+        word = random.choice(source_words)
+        pairs.append({**base, "input": f"Tell me something about {word}."})
+
+    # 2. Category-seeded (whenever a slot word has a known category).
+    # Sorted so the pick is reproducible under a fixed random seed; set
+    # iteration order for strings varies between interpreter runs.
+    if slot_categories:
+        cat = random.choice(sorted(slot_categories))
+        pairs.append({**base, "input": f"Tell me a saying about {cat}."})
+
+    # 3. Persona-seeded (whenever there is a source word)
+    if source_words:
+        persona = random.choice(PERSONAS)
+        word = random.choice(source_words)
+        pairs.append({
+            **base,
+            "input": f"What would {article_for(persona)} {persona} say about {word}?",
+        })
+
+    # 4. Template-seeded (include ~70% of the time, if the template is known)
+    if random.random() < 0.7:
+        template_name = TEMPLATE_NAMES.get(meta_template, meta_template)
+        # Without a template name the prompt would read "Give me a  proverb."
+        if template_name:
+            pairs.append({
+                **base,
+                "input": f"Give me {article_for(template_name)} {template_name} proverb.",
+            })
+
+    # 5. Open-ended (include ~30% of the time)
+    if random.random() < 0.3:
+        prompt = random.choice(OPEN_ENDED_PROMPTS)
+        pairs.append({**base, "input": prompt})
+
+    return pairs
+
+
+def generate_fictional_pairs(entities):
+    """Generate training pairs for fictional entities.
+
+    These pairs include the entity description in the input. Between 15 and
+    25 pairs are produced per entity, each with a randomly chosen framing
+    (persona, related word, category or open-ended).
+
+    Args:
+        entities: Iterable of entity dicts with a "name" and optional
+            "categories", "properties" and "relations".
+
+    Returns:
+        A list of pair dicts. Each "output" is an empty placeholder and the
+        pair is flagged with "_needs_generation" and carries its "_entity",
+        so a later step can fill it in.
+
+    Raises:
+        KeyError: if an entity has no "name".
+    """
+
+    def article_for(phrase):
+        # Spelling-based heuristic: "an" before a leading vowel letter.
+        return "an" if phrase[:1].lower() in ("a", "e", "i", "o", "u") else "a"
+
+    pairs = []
+
+    # Generate 15-25 pairs per entity
+    for entity in entities:
+        name = entity["name"]
+        desc = format_entity_description(entity)
+        rels = entity.get("relations") or {}
+        # `or` rather than a .get() default: a present-but-empty category list
+        # would otherwise reach random.choice([]) and raise IndexError.
+        cats = entity.get("categories") or ["thing"]
+
+        # Collect words related to this entity
+        related_words = []
+        for targets in rels.values():
+            related_words.extend(targets)
+
+        n_pairs = random.randint(15, 25)
+
+        for _ in range(n_pairs):
+            framing = random.choice(["persona", "word", "category", "open"])
+
+            if framing == "persona":
+                persona = random.choice(PERSONAS)
+                input_text = (
+                    f"{desc} What would {article_for(persona)} {persona} "
+                    f"say about {article_for(name)} {name}?"
+                )
+            elif framing == "word" and related_words:
+                word = random.choice(related_words)
+                input_text = f"{desc} Tell me a saying about {name} and {word}."
+            elif framing == "category":
+                cat = random.choice(cats)
+                input_text = f"{desc} Give me folk wisdom about this {cat}."
+            else:
+                # "open", or "word" when the entity has no related words.
+                input_text = f"{desc} Tell me some folk wisdom about {name}."
+
+            # Placeholder output — these would ideally be generated through the
+            # template engine with fictional entities loaded, then polished.
+            # For now the output stays empty and the pair is flagged.
+            pairs.append({
+                "input": input_text,
+                "output": "",  # Will be filled by actual generation
+                "meta_template": "fictional",
+                "source_words": [name] + related_words[:3],
+                "_needs_generation": True,
+                "_entity": entity,
+            })
+
+    return pairs
+
+
+def _classify_pair(pair):
+    """Return the input-framing bucket used in the end-of-run statistics."""
+    if pair.get("meta_template") == "fictional":
+        return "fictional"
+    inp = pair["input"]
+    # Exact match first: "Give me a proverb." is an open-ended prompt but
+    # would also satisfy the template-seeded prefix test below.
+    if inp in OPEN_ENDED_PROMPTS:
+        return "open_ended"
+    if inp.startswith("Tell me something about"):
+        return "word_seeded"
+    if inp.startswith("Tell me a saying about"):
+        return "category_seeded"
+    if inp.startswith("What would a"):
+        return "persona_seeded"
+    if inp.startswith("Give me a") and "proverb" in inp:
+        return "template_seeded"
+    return "fictional"
+
+
+def main():
+    """Command-line entry point.
+
+    Reads the filtered corpus JSONL, expands every entry into training pairs,
+    adds any fictional-entity pairs that already have an output, writes the
+    result as JSONL and prints a breakdown by input framing. Exits with
+    status 1 when the input file is missing or contains an invalid JSON line.
+    """
+    from collections import Counter
+
+    parser = argparse.ArgumentParser(description="Format training pairs for fine-tuning.")
+    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
+                        help="Input filtered JSONL file")
+    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"),
+                        help="Output training pairs JSONL file")
+    parser.add_argument("--entities", default=str(EXAMPLES_DIR / "my_world.json"),
+                        help="Fictional entities JSON file")
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    entities_path = Path(args.entities)
+
+    if not input_path.exists():
+        print(f"Error: {input_path} not found.", file=sys.stderr)
+        sys.exit(1)
+
+    # Load vocab categories
+    word_cats = load_vocab_categories()
+
+    # Load filtered entries (one JSON object per non-blank line)
+    entries = []
+    with open(input_path, encoding="utf-8") as f:
+        for line_no, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                entries.append(json.loads(line))
+            except json.JSONDecodeError as exc:
+                # Point at the offending line instead of a bare traceback.
+                print(f"Error: {input_path}:{line_no}: invalid JSON ({exc}).", file=sys.stderr)
+                sys.exit(1)
+
+    print(f"Loaded {len(entries)} filtered entries")
+
+    # Generate training pairs for each entry
+    all_pairs = []
+    for entry in entries:
+        all_pairs.extend(generate_training_pairs(entry, word_cats))
+
+    print(f"Generated {len(all_pairs)} training pairs from polished sayings")
+
+    # Generate fictional entity pairs
+    fictional_entities = []
+    if entities_path.exists():
+        with open(entities_path, encoding="utf-8") as f:
+            data = json.load(f)
+        # Copy so extending below never mutates the loaded document, and
+        # tolerate an explicit null "entities" value.
+        fictional_entities = list(data.get("entities") or [])
+        print(f"Loaded {len(fictional_entities)} fictional entities from {entities_path}")
+
+    # Add auto-generated entities
+    fictional_entities.extend(AUTO_ENTITIES)
+    print(f"Total fictional entities (file + auto-generated): {len(fictional_entities)}")
+
+    fictional_pairs = generate_fictional_pairs(fictional_entities)
+
+    # Filter out placeholder pairs (those that still need generation)
+    # In a full pipeline, these would be generated through the template engine.
+    # For now, skip any with empty output.
+    real_fictional = [p for p in fictional_pairs if p.get("output")]
+    placeholder_fictional = [p for p in fictional_pairs if not p.get("output")]
+
+    if placeholder_fictional:
+        print(f"  {len(placeholder_fictional)} fictional pairs need generation via template engine")
+        print("  (Run folksy_generator.py with --entities to generate these, then re-run this script)")
+
+    all_pairs.extend(real_fictional)
+
+    # Clean up internal fields before writing
+    for pair in all_pairs:
+        pair.pop("_needs_generation", None)
+        pair.pop("_entity", None)
+
+    # Write output
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        for pair in all_pairs:
+            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
+
+    # Stats
+    input_types = Counter(_classify_pair(pair) for pair in all_pairs)
+
+    print(f"\nTotal training pairs: {len(all_pairs)}")
+    print("Distribution by input type:")
+    for itype, count in sorted(input_types.items()):
+        print(f"  {itype:20s} {count:5d}")
+
+    print(f"\nOutput: {output_path}")
+
+
+# Run the CLI only when executed as a script, so importing this module has no
+# side effects beyond defining its constants and functions.
+if __name__ == "__main__":
+    main()