#!/usr/bin/env python3
"""Format filtered sayings into training pairs for fine-tuning.

Each polished saying generates up to 5 training pairs with different input
framings (word-, category-, persona-, template-seeded and open-ended).
Also generates fictional entity training pairs.

Usage:
 python scripts/format_training_pairs.py
 python scripts/format_training_pairs.py --input corpus/corpus_filtered.jsonl --output corpus/training_pairs.jsonl
"""

import argparse
import csv
import json
import random
import sys
from collections import Counter
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR = PROJECT_DIR / "corpus"
DATA_DIR = PROJECT_DIR / "data"
EXAMPLES_DIR = PROJECT_DIR / "examples"

# Template name mappings for human-readable prompts
TEMPLATE_NAMES = {
    "deconstruction": "deconstruction",
    "denial_of_consequences": "denial of consequences",
    "ironic_deficiency": "ironic deficiency",
    "futile_preparation": "futile preparation",
    "hypocritical_complaint": "hypocritical complaint",
    "tautological_wisdom": "tautological wisdom",
    "false_equivalence": "false equivalence",
}

PERSONAS = ["farmer", "grandmother", "old sailor", "blacksmith", "innkeeper", "shepherd"]

OPEN_ENDED_PROMPTS = [
    "Tell me some folk wisdom.",
    "What do they say?",
    "Give me a proverb.",
    "Share some old-time wisdom.",
    "What's a good saying?",
]

# Auto-generated fictional entities for additional training pairs
AUTO_ENTITIES = [
    {
        "name": "Stoneclaw",
        "categories": ["animal", "predator"],
        "properties": ["fierce", "rocky", "nocturnal"],
        "relations": {
            "AtLocation": ["cave", "mountain"],
            "HasA": ["claws", "scales"],
            "CapableOf": ["hunting", "climbing"],
        },
    },
    {
        "name": "Duskmelon",
        "categories": ["fruit", "food"],
        "properties": ["purple", "sweet", "fragrant"],
        "relations": {
            "AtLocation": ["garden", "market"],
            "UsedFor": ["eating", "jam"],
            "MadeOf": ["seed", "juice"],
        },
    },
    {
        "name": "Windloom",
        "categories": ["tool", "craft"],
        "properties": ["wooden", "portable", "intricate"],
        "relations": {
            "UsedFor": ["weaving", "thread"],
            "MadeOf": ["wood", "string"],
            "AtLocation": ["workshop", "cottage"],
        },
    },
    {
        "name": "Briarvine",
        "categories": ["plant", "herb"],
        "properties": ["thorny", "green", "medicinal"],
        "relations": {
            "AtLocation": ["forest", "hedge"],
            "UsedFor": ["healing", "tea"],
            "HasA": ["thorn", "leaf"],
        },
    },
    {
        "name": "Mudhog",
        "categories": ["animal", "livestock"],
        "properties": ["muddy", "stubborn", "heavy"],
        "relations": {
            "AtLocation": ["farm", "swamp"],
            "Desires": ["food", "mud"],
            "CapableOf": ["digging", "rooting"],
        },
    },
    {
        "name": "Frostberry",
        "categories": ["fruit", "food"],
        "properties": ["cold", "blue", "tiny"],
        "relations": {
            "AtLocation": ["mountain", "tundra"],
            "UsedFor": ["eating", "preserves"],
            "HasProperty": ["cold", "tart"],
        },
    },
    {
        "name": "Lanternmoss",
        "categories": ["plant", "fungus"],
        "properties": ["glowing", "damp", "soft"],
        "relations": {
            "AtLocation": ["cave", "swamp"],
            "UsedFor": ["light", "decoration"],
            "HasProperty": ["luminous", "fragile"],
        },
    },
    {
        "name": "Cinderhawk",
        "categories": ["bird", "animal"],
        "properties": ["fiery", "fast", "red"],
        "relations": {
            "AtLocation": ["mountain", "volcano"],
            "CapableOf": ["flying", "hunting"],
            "HasA": ["talons", "feathers"],
        },
    },
    {
        "name": "Rootstone",
        "categories": ["stone", "material"],
        "properties": ["veined", "hard", "ancient"],
        "relations": {
            "AtLocation": ["quarry", "riverbed"],
            "UsedFor": ["building", "carving"],
            "MadeOf": ["mineral", "root"],
        },
    },
    {
        "name": "Silkwort",
        "categories": ["plant", "fiber"],
        "properties": ["silky", "white", "tall"],
        "relations": {
            "AtLocation": ["field", "meadow"],
            "UsedFor": ["weaving", "cloth"],
            "HasA": ["stem", "fiber"],
        },
    },
    {
        "name": "Kettlefrog",
        "categories": ["animal", "amphibian"],
        "properties": ["loud", "round", "green"],
        "relations": {
            "AtLocation": ["pond", "marsh"],
            "CapableOf": ["jumping", "croaking"],
            "Desires": ["flies", "water"],
        },
    },
    {
        "name": "Dustwheat",
        "categories": ["crop", "grain"],
        "properties": ["dry", "golden", "hardy"],
        "relations": {
            "AtLocation": ["field", "barn"],
            "UsedFor": ["bread", "flour"],
            "HasPrerequisite": ["rain", "soil"],
        },
    },
]

_VOWELS = frozenset("aeiou")


def _indefinite_article(phrase):
    """Return "an" if *phrase* starts with a vowel letter, otherwise "a".

    This is a spelling-based heuristic (it does not know about words such as
    "hour" or "unicorn"); it is correct for every entry in PERSONAS and
    TEMPLATE_NAMES. An empty phrase yields "a".
    """
    return "an" if phrase[:1].lower() in _VOWELS else "a"


def format_entity_description(entity):
    """Format entity into a natural description string.

    Args:
        entity: dict with a required "name" and optional "categories",
            "properties" and "relations" (relation name -> list of words).

    Returns:
        Space-joined sentences describing the entity; an empty string when
        the entity has no categories and none of the described relations
        (AtLocation, HasA, CapableOf, UsedFor).
    """
    name = entity["name"]
    cats = entity.get("categories") or []
    props = entity.get("properties") or []
    rels = entity.get("relations") or {}

    parts = []
    subject = f"{_indefinite_article(name).capitalize()} {name}"

    # Category description (properties are only used alongside a category)
    if cats:
        cat_str = " and ".join(cats[:2])
        if props:
            noun_phrase = f"{', '.join(props[:3])} {cat_str}"
        else:
            noun_phrase = cat_str
        parts.append(f"{subject} is {_indefinite_article(noun_phrase)} {noun_phrase}.")

    # Relations are checked for truthiness, not mere presence, so an empty
    # list does not produce a dangling sentence such as "It is found near .".
    locs = rels.get("AtLocation")
    if locs:
        parts.append(f"It is found near {' and '.join(locs[:2])}.")

    has = rels.get("HasA")
    if has:
        parts.append(f"It has {', '.join(has[:3])}.")

    caps = rels.get("CapableOf")
    if caps:
        parts.append(f"It can {' and '.join(caps[:2])}.")

    uses = rels.get("UsedFor")
    if uses:
        parts.append(f"It is used for {' and '.join(uses[:2])}.")

    return " ".join(parts)


def load_vocab_categories():
    """Load vocab to get word -> categories mapping.

    Reads DATA_DIR / "folksy_vocab.csv", which must have "word" and
    "categories" columns; "categories" is a comma-separated list.

    Returns:
        dict mapping each word to its list of category names. Empty when the
        vocab file does not exist.

    Raises:
        KeyError: if the CSV header lacks a "word" or "categories" column.
    """
    word_cats = {}
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if not vocab_path.exists():
        return word_cats

    with open(vocab_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            word = row["word"]
            if not word:
                continue
            # DictReader fills missing trailing fields with None on short rows.
            raw_cats = row["categories"] or ""
            word_cats[word] = [c.strip() for c in raw_cats.split(",") if c.strip()]
    return word_cats


def generate_training_pairs(entry, word_cats):
    """Generate up to 5 training pairs for a single polished saying.

    Args:
        entry: dict with "polished_text", "slots" (slot name -> word) and
            "meta_template".
        word_cats: mapping of vocab word -> list of categories, as returned
            by load_vocab_categories().

    Returns:
        List of dicts with keys "input", "output", "meta_template" and
        "source_words". Empty when the entry has no polished text, since a
        pair with an empty output is useless as training data.
    """
    polished = entry.get("polished_text") or ""
    if not polished.strip():
        return []

    slots = entry.get("slots") or {}
    meta_template = entry.get("meta_template") or ""

    # Collect source words (concrete nouns from slots); phrases that already
    # carry an article and non-string values are skipped.
    source_words = [
        v
        for v in slots.values()
        if isinstance(v, str) and len(v) > 1 and not v.startswith(("a ", "an "))
    ]

    # Determine categories of slot words
    slot_categories = set()
    for word in source_words:
        key = word.lower().replace(" ", "_")
        slot_categories.update(word_cats.get(key, ()))

    pairs = []
    base = {
        "output": polished,
        "meta_template": meta_template,
        "source_words": source_words,
    }

    # 1. Word-seeded (whenever there is a source word)
    if source_words:
        word = random.choice(source_words)
        pairs.append({**base, "input": f"Tell me something about {word}."})

    # 2. Category-seeded (whenever we have categories). Sorting makes the
    # choice reproducible under random.seed(); raw set order depends on
    # per-process string hashing.
    if slot_categories:
        cat = random.choice(sorted(slot_categories))
        pairs.append({**base, "input": f"Tell me a saying about {cat}."})

    # 3. Persona-seeded (whenever there is a source word)
    if source_words:
        persona = random.choice(PERSONAS)
        word = random.choice(source_words)
        article = _indefinite_article(persona)
        pairs.append({**base, "input": f"What would {article} {persona} say about {word}?"})

    # 4. Template-seeded (include ~70% of the time, only for a named template)
    template_name = TEMPLATE_NAMES.get(meta_template, meta_template)
    if template_name and random.random() < 0.7:
        article = _indefinite_article(template_name)
        pairs.append({**base, "input": f"Give me {article} {template_name} proverb."})

    # 5. Open-ended (include ~30% of the time)
    if random.random() < 0.3:
        pairs.append({**base, "input": random.choice(OPEN_ENDED_PROMPTS)})

    return pairs


def generate_fictional_pairs(entities):
    """Generate training pairs for fictional entities.

    These pairs include the entity description in the input.

    Args:
        entities: iterable of entity dicts (see format_entity_description).

    Returns:
        List of placeholder pairs, 15-25 per entity. Each has an empty
        "output", "meta_template" set to "fictional", and the internal
        "_needs_generation" / "_entity" fields.
    """
    pairs = []

    for entity in entities:
        name = entity["name"]
        desc = format_entity_description(entity)
        rels = entity.get("relations") or {}
        # A present-but-empty category list must also fall back to "thing";
        # random.choice() raises IndexError on an empty sequence.
        cats = entity.get("categories") or ["thing"]

        # Collect words related to this entity
        related_words = []
        for targets in rels.values():
            related_words.extend(targets)

        # Generate 15-25 pairs per entity
        n_pairs = random.randint(15, 25)
        for _ in range(n_pairs):
            framing = random.choice(["persona", "word", "category", "open"])

            if framing == "persona":
                persona = random.choice(PERSONAS)
                prompt = (
                    f"What would {_indefinite_article(persona)} {persona} "
                    f"say about {_indefinite_article(name)} {name}?"
                )
            elif framing == "word" and related_words:
                word = random.choice(related_words)
                prompt = f"Tell me a saying about {name} and {word}."
            elif framing == "category":
                cat = random.choice(cats)
                prompt = f"Give me folk wisdom about this {cat}."
            else:
                prompt = f"Tell me some folk wisdom about {name}."

            # strip() avoids a leading space when the description is empty.
            input_text = f"{desc} {prompt}".strip()

            # Placeholder output — these would ideally be generated through the
            # template engine with fictional entities loaded, then polished.
            pairs.append({
                "input": input_text,
                "output": "",  # Will be filled by actual generation
                "meta_template": "fictional",
                "source_words": [name] + related_words[:3],
                "_needs_generation": True,
                "_entity": entity,
            })

    return pairs


def _classify_pair(pair):
    """Return the input-type label used in the end-of-run statistics."""
    if pair.get("meta_template") == "fictional":
        return "fictional"
    inp = pair["input"]
    # Open-ended prompts are matched exactly and first: "Give me a proverb."
    # would otherwise satisfy the template-seeded test below.
    if inp in OPEN_ENDED_PROMPTS:
        return "open_ended"
    if inp.startswith("Tell me something about"):
        return "word_seeded"
    if inp.startswith("Tell me a saying about"):
        return "category_seeded"
    if inp.startswith("What would a"):
        return "persona_seeded"
    if inp.startswith("Give me a") and "proverb" in inp:
        return "template_seeded"
    return "other"


def main():
    """Command-line entry point: read filtered sayings, write training pairs."""
    parser = argparse.ArgumentParser(description="Format training pairs for fine-tuning.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
                        help="Input filtered JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"),
                        help="Output training pairs JSONL file")
    parser.add_argument("--entities", default=str(EXAMPLES_DIR / "my_world.json"),
                        help="Fictional entities JSON file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    entities_path = Path(args.entities)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load vocab categories
    word_cats = load_vocab_categories()

    # Load filtered entries (one JSON object per non-blank line)
    entries = []
    with open(input_path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as err:
                print(f"Error: {input_path}:{line_no}: invalid JSON ({err})", file=sys.stderr)
                sys.exit(1)

    print(f"Loaded {len(entries)} filtered entries")

    # Generate training pairs for each entry
    all_pairs = []
    for entry in entries:
        all_pairs.extend(generate_training_pairs(entry, word_cats))

    print(f"Generated {len(all_pairs)} training pairs from polished sayings")

    # Generate fictional entity pairs
    fictional_entities = []
    if entities_path.exists():
        with open(entities_path, encoding="utf-8") as f:
            data = json.load(f)
        # Copy so that extending below never mutates the loaded document.
        fictional_entities = list(data.get("entities") or [])
        print(f"Loaded {len(fictional_entities)} fictional entities from {entities_path}")

    # Add auto-generated entities
    fictional_entities.extend(AUTO_ENTITIES)
    print(f"Total fictional entities (file + auto-generated): {len(fictional_entities)}")

    fictional_pairs = generate_fictional_pairs(fictional_entities)

    # Filter out placeholder pairs (those that still need generation).
    # In a full pipeline, these would be generated through the template engine.
    # For now, skip any with empty output.
    real_fictional = [p for p in fictional_pairs if p.get("output")]
    placeholder_fictional = [p for p in fictional_pairs if not p.get("output")]

    if placeholder_fictional:
        print(f" {len(placeholder_fictional)} fictional pairs need generation via template engine")
        print(" (Run folksy_generator.py with --entities to generate these, then re-run this script)")

    all_pairs.extend(real_fictional)

    # Clean up internal fields before writing
    for pair in all_pairs:
        pair.pop("_needs_generation", None)
        pair.pop("_entity", None)

    # Write output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    # Stats
    input_types = Counter(_classify_pair(pair) for pair in all_pairs)

    print(f"\nTotal training pairs: {len(all_pairs)}")
    print("Distribution by input type:")
    for itype, count in sorted(input_types.items()):
        print(f" {itype:20s} {count:5d}")
    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()