corpus generation (work from mid february)
This commit is contained in:
parent
8c8a058301
commit
356b62c6ea
16 changed files with 25872 additions and 38 deletions
385
scripts/format_training_pairs.py
Normal file
385
scripts/format_training_pairs.py
Normal file
|
|
@ -0,0 +1,385 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Format filtered sayings into training pairs for fine-tuning.
|
||||
|
||||
Each polished saying generates up to five training pairs (typically 3-5), each with a different input framing.
|
||||
Also generates fictional entity training pairs.
|
||||
|
||||
Usage:
|
||||
python scripts/format_training_pairs.py
|
||||
python scripts/format_training_pairs.py --input corpus/corpus_filtered.jsonl --output corpus/training_pairs.jsonl
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Directory layout, all anchored on this script's own location:
# <project>/scripts/<this file>, with corpus/, data/ and examples/ as siblings
# of scripts/.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
CORPUS_DIR, DATA_DIR, EXAMPLES_DIR = (
    PROJECT_DIR / subdir for subdir in ("corpus", "data", "examples")
)
|
||||
|
||||
# Template identifier -> human-readable name used inside prompts.
# Every readable name is simply the identifier with underscores turned into
# spaces, so the mapping is derived from the ordered list of identifiers.
TEMPLATE_NAMES = {
    template_id: template_id.replace("_", " ")
    for template_id in (
        "deconstruction",
        "denial_of_consequences",
        "ironic_deficiency",
        "futile_preparation",
        "hypocritical_complaint",
        "tautological_wisdom",
        "false_equivalence",
    )
}
|
||||
|
||||
# Speakers substituted into "What would a <persona> say about ...?" prompts.
PERSONAS = [
    "farmer",
    "grandmother",
    "old sailor",
    "blacksmith",
    "innkeeper",
    "shepherd",
]

# Prompts that ask for a saying without naming any word, category or template.
OPEN_ENDED_PROMPTS = [
    "Tell me some folk wisdom.",
    "What do they say?",
    "Give me a proverb.",
    "Share some old-time wisdom.",
    "What's a good saying?",
]
|
||||
|
||||
def _make_entity(name, categories, properties, **relations):
    """Assemble one fictional-entity record.

    Keyword arguments beyond ``categories``/``properties`` become entries of
    the "relations" mapping, in the order they are passed.
    """
    return {
        "name": name,
        "categories": list(categories),
        "properties": list(properties),
        "relations": {rel: list(targets) for rel, targets in relations.items()},
    }


# Auto-generated fictional entities for additional training pairs
AUTO_ENTITIES = [
    _make_entity(
        "Stoneclaw",
        categories=("animal", "predator"),
        properties=("fierce", "rocky", "nocturnal"),
        AtLocation=("cave", "mountain"),
        HasA=("claws", "scales"),
        CapableOf=("hunting", "climbing"),
    ),
    _make_entity(
        "Duskmelon",
        categories=("fruit", "food"),
        properties=("purple", "sweet", "fragrant"),
        AtLocation=("garden", "market"),
        UsedFor=("eating", "jam"),
        MadeOf=("seed", "juice"),
    ),
    _make_entity(
        "Windloom",
        categories=("tool", "craft"),
        properties=("wooden", "portable", "intricate"),
        UsedFor=("weaving", "thread"),
        MadeOf=("wood", "string"),
        AtLocation=("workshop", "cottage"),
    ),
    _make_entity(
        "Briarvine",
        categories=("plant", "herb"),
        properties=("thorny", "green", "medicinal"),
        AtLocation=("forest", "hedge"),
        UsedFor=("healing", "tea"),
        HasA=("thorn", "leaf"),
    ),
    _make_entity(
        "Mudhog",
        categories=("animal", "livestock"),
        properties=("muddy", "stubborn", "heavy"),
        AtLocation=("farm", "swamp"),
        Desires=("food", "mud"),
        CapableOf=("digging", "rooting"),
    ),
    _make_entity(
        "Frostberry",
        categories=("fruit", "food"),
        properties=("cold", "blue", "tiny"),
        AtLocation=("mountain", "tundra"),
        UsedFor=("eating", "preserves"),
        HasProperty=("cold", "tart"),
    ),
    _make_entity(
        "Lanternmoss",
        categories=("plant", "fungus"),
        properties=("glowing", "damp", "soft"),
        AtLocation=("cave", "swamp"),
        UsedFor=("light", "decoration"),
        HasProperty=("luminous", "fragile"),
    ),
    _make_entity(
        "Cinderhawk",
        categories=("bird", "animal"),
        properties=("fiery", "fast", "red"),
        AtLocation=("mountain", "volcano"),
        CapableOf=("flying", "hunting"),
        HasA=("talons", "feathers"),
    ),
    _make_entity(
        "Rootstone",
        categories=("stone", "material"),
        properties=("veined", "hard", "ancient"),
        AtLocation=("quarry", "riverbed"),
        UsedFor=("building", "carving"),
        MadeOf=("mineral", "root"),
    ),
    _make_entity(
        "Silkwort",
        categories=("plant", "fiber"),
        properties=("silky", "white", "tall"),
        AtLocation=("field", "meadow"),
        UsedFor=("weaving", "cloth"),
        HasA=("stem", "fiber"),
    ),
    _make_entity(
        "Kettlefrog",
        categories=("animal", "amphibian"),
        properties=("loud", "round", "green"),
        AtLocation=("pond", "marsh"),
        CapableOf=("jumping", "croaking"),
        Desires=("flies", "water"),
    ),
    _make_entity(
        "Dustwheat",
        categories=("crop", "grain"),
        properties=("dry", "golden", "hardy"),
        AtLocation=("field", "barn"),
        UsedFor=("bread", "flour"),
        HasPrerequisite=("rain", "soil"),
    ),
]
|
||||
|
||||
|
||||
def format_entity_description(entity):
    """Format entity into a natural description string.

    The description is built from up to five sentences, in this order:
    a category sentence (with up to three properties and two categories),
    then one sentence each for the "AtLocation", "HasA", "CapableOf" and
    "UsedFor" relations. Other relation types are not rendered.

    Args:
        entity: Mapping with a required "name" and optional "categories",
            "properties" (lists of strings) and "relations" (relation name ->
            list of target strings).

    Returns:
        The sentences joined by single spaces, or "" when nothing applies.

    Raises:
        KeyError: If ``entity`` has no "name".
    """
    name = entity["name"]
    # ``or`` also covers keys that are present but null/empty in the JSON.
    cats = entity.get("categories") or []
    props = entity.get("properties") or []
    rels = entity.get("relations") or {}

    parts = []

    # Category sentence; properties are only mentioned alongside a category.
    if cats:
        cat_str = " and ".join(cats[:2])
        if props:
            prop_str = ", ".join(props[:3])
            parts.append(f"A {name} is a {prop_str} {cat_str}.")
        else:
            parts.append(f"A {name} is a {cat_str}.")

    # (relation, sentence template, separator, max targets shown)
    relation_sentences = (
        ("AtLocation", "It is found near {}.", " and ", 2),
        ("HasA", "It has {}.", ", ", 3),
        ("CapableOf", "It can {}.", " and ", 2),
        ("UsedFor", "It is used for {}.", " and ", 2),
    )
    for relation, template, separator, limit in relation_sentences:
        targets = rels.get(relation)
        # Skip relations that are present but empty; otherwise the sentence
        # would have nothing to say ("It is found near .").
        if targets:
            parts.append(template.format(separator.join(targets[:limit])))

    return " ".join(parts)
|
||||
|
||||
|
||||
def load_vocab_categories():
    """Load vocab to get word -> categories mapping.

    Reads ``folksy_vocab.csv`` from ``DATA_DIR``. The file must have "word"
    and "categories" columns; "categories" is a comma-separated list.

    Returns:
        Dict mapping each word to its list of stripped, non-empty category
        names. Empty when the vocab file does not exist.

    Raises:
        KeyError: If the CSV header lacks the "word" or "categories" column.
    """
    word_cats = {}
    vocab_path = DATA_DIR / "folksy_vocab.csv"
    if not vocab_path.exists():
        return word_cats

    with open(vocab_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            word = row["word"]
            if not word:
                # Blank or truncated row: nothing to key the categories on.
                continue
            # DictReader fills fields missing from a short row with None,
            # so treat that the same as an empty categories cell.
            raw_categories = row["categories"] or ""
            word_cats[word] = [c.strip() for c in raw_categories.split(",") if c.strip()]
    return word_cats
|
||||
|
||||
|
||||
def generate_training_pairs(entry, word_cats):
    """Generate up to five training pairs for a single polished saying.

    Every pair shares the same output (the polished saying) and differs only
    in its input prompt. Framings, in emission order:

    1. word-seeded      -- needs at least one usable slot word
    2. category-seeded  -- needs a slot word with known categories
    3. persona-seeded   -- needs at least one usable slot word
    4. template-seeded  -- ~70% of the time, needs a template name
    5. open-ended       -- ~30% of the time

    Args:
        entry: Mapping from the filtered corpus; the keys read are
            "polished_text", "slots" and "meta_template".
        word_cats: Mapping of vocabulary word -> list of category names.
            It is looked up with the lower-cased slot word, spaces replaced
            by underscores.

    Returns:
        List of dicts with keys "output", "meta_template", "source_words"
        and "input". Empty when the entry has no polished text, since a pair
        without an output is useless for training.
    """
    polished = entry.get("polished_text", "")
    if not polished:
        return []

    slots = entry.get("slots") or {}
    meta_template = entry.get("meta_template") or ""

    # Collect source words (concrete nouns from slots): skip non-strings,
    # single characters and phrases that start with an article.
    source_words = [
        value for value in slots.values()
        if isinstance(value, str)
        and len(value) > 1
        and not value.startswith(("a ", "an "))
    ]

    # Determine categories of slot words
    slot_categories = set()
    for word in source_words:
        slot_categories.update(word_cats.get(word.lower().replace(" ", "_"), ()))

    pairs = []
    base = {
        "output": polished,
        "meta_template": meta_template,
        "source_words": source_words,
    }

    # 1. Word-seeded
    if source_words:
        word = random.choice(source_words)
        pairs.append({**base, "input": f"Tell me something about {word}."})

    # 2. Category-seeded
    if slot_categories:
        # Sort first: set iteration order varies between interpreter runs,
        # which would defeat any random seed set by the caller.
        cat = random.choice(sorted(slot_categories))
        pairs.append({**base, "input": f"Tell me a saying about {cat}."})

    # 3. Persona-seeded
    if source_words:
        persona = random.choice(PERSONAS)
        word = random.choice(source_words)
        pairs.append({**base, "input": f"What would a {persona} say about {word}?"})

    # 4. Template-seeded (include ~70% of the time). Skipped when the entry
    # has no template name, which would leave a hole in the prompt.
    if meta_template and random.random() < 0.7:
        template_name = TEMPLATE_NAMES.get(meta_template, meta_template.replace("_", " "))
        # "an ironic deficiency proverb", not "a ironic deficiency proverb".
        article = "an" if template_name[0].lower() in "aeiou" else "a"
        pairs.append({**base, "input": f"Give me {article} {template_name} proverb."})

    # 5. Open-ended (include ~30% of the time)
    if random.random() < 0.3:
        prompt = random.choice(OPEN_ENDED_PROMPTS)
        pairs.append({**base, "input": prompt})

    return pairs
|
||||
|
||||
|
||||
def generate_fictional_pairs(entities):
    """Generate training pairs for fictional entities.

    These pairs include the entity description in the input. Each entity
    yields 15-25 pairs, each using one randomly chosen framing: persona,
    related word, category or open-ended.

    The outputs are placeholders: every pair is emitted with an empty
    "output" and ``"_needs_generation": True``, to be filled in by a later
    generation step.

    Args:
        entities: Iterable of entity mappings with a required "name" and
            optional "categories" and "relations".

    Returns:
        List of pair dicts with keys "input", "output", "meta_template",
        "source_words", "_needs_generation" and "_entity".
    """
    pairs = []

    for entity in entities:
        name = entity["name"]
        desc = format_entity_description(entity)
        rels = entity.get("relations") or {}
        # Fall back to a generic noun for a missing OR empty category list;
        # random.choice raises IndexError on an empty sequence.
        cats = entity.get("categories") or ["thing"]

        # Collect words related to this entity
        related_words = [word for targets in rels.values() for word in targets]

        # Generate 15-25 pairs per entity
        n_pairs = random.randint(15, 25)

        for _ in range(n_pairs):
            framing = random.choice(["persona", "word", "category", "open"])

            if framing == "persona":
                persona = random.choice(PERSONAS)
                input_text = f"{desc} What would a {persona} say about a {name}?"
            elif framing == "word" and related_words:
                word = random.choice(related_words)
                input_text = f"{desc} Tell me a saying about {name} and {word}."
            elif framing == "category":
                cat = random.choice(cats)
                input_text = f"{desc} Give me folk wisdom about this {cat}."
            else:
                # Open-ended; also the fallback for "word" framing when the
                # entity has no related words.
                input_text = f"{desc} Tell me some folk wisdom about {name}."

            # Placeholder output — these would ideally be generated through the
            # template engine with fictional entities loaded, then polished.
            pairs.append({
                "input": input_text,
                "output": "",  # Will be filled by actual generation
                "meta_template": "fictional",
                "source_words": [name] + related_words[:3],
                "_needs_generation": True,
                "_entity": entity,
            })

    return pairs
|
||||
|
||||
|
||||
def _classify_input(prompt):
    """Bucket a training prompt by the framing that produced it (stats only)."""
    # Exact open-ended prompts are tested first: "Give me a proverb." would
    # otherwise be counted as template-seeded by the looser test below.
    if prompt in OPEN_ENDED_PROMPTS:
        return "open_ended"
    if prompt.startswith("Tell me something about"):
        return "word_seeded"
    if prompt.startswith("Tell me a saying about"):
        return "category_seeded"
    if prompt.startswith("What would a"):
        return "persona_seeded"
    if prompt.startswith("Give me a") and "proverb" in prompt:
        return "template_seeded"
    # Fictional prompts are prefixed by the entity description, so they match
    # none of the fixed prefixes above.
    return "fictional"


def main():
    """Command-line entry point: build the training-pairs JSONL file.

    Steps: read the filtered corpus (one JSON object per line), generate
    pairs for every entry, generate fictional-entity pairs (from the optional
    entities file plus AUTO_ENTITIES), keep only fictional pairs that already
    have an output, strip internal "_"-prefixed fields, write the result and
    print a per-framing summary.

    Exits with status 1 when the input file is missing or contains a line
    that is not valid JSON.
    """
    from collections import Counter  # only needed for the summary below

    parser = argparse.ArgumentParser(description="Format training pairs for fine-tuning.")
    parser.add_argument("--input", default=str(CORPUS_DIR / "corpus_filtered.jsonl"),
                        help="Input filtered JSONL file")
    parser.add_argument("--output", default=str(CORPUS_DIR / "training_pairs.jsonl"),
                        help="Output training pairs JSONL file")
    parser.add_argument("--entities", default=str(EXAMPLES_DIR / "my_world.json"),
                        help="Fictional entities JSON file")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    entities_path = Path(args.entities)

    if not input_path.exists():
        print(f"Error: {input_path} not found.", file=sys.stderr)
        sys.exit(1)

    # Load vocab categories
    word_cats = load_vocab_categories()

    # Load filtered entries
    entries = []
    with open(input_path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Report where the corpus is broken instead of a bare traceback.
                print(f"Error: {input_path}:{lineno}: invalid JSON ({err})", file=sys.stderr)
                sys.exit(1)

    print(f"Loaded {len(entries)} filtered entries")

    # Generate training pairs for each entry
    all_pairs = []
    for entry in entries:
        all_pairs.extend(generate_training_pairs(entry, word_cats))

    print(f"Generated {len(all_pairs)} training pairs from polished sayings")

    # Generate fictional entity pairs
    fictional_entities = []
    if entities_path.exists():
        with open(entities_path, encoding="utf-8") as f:
            data = json.load(f)
        # Copy so extending below never mutates the loaded document.
        fictional_entities = list(data.get("entities") or [])
        print(f"Loaded {len(fictional_entities)} fictional entities from {entities_path}")

    # Add auto-generated entities
    fictional_entities.extend(AUTO_ENTITIES)
    print(f"Total fictional entities (file + auto-generated): {len(fictional_entities)}")

    fictional_pairs = generate_fictional_pairs(fictional_entities)

    # Filter out placeholder pairs (those that still need generation).
    # In a full pipeline, these would be generated through the template engine.
    # For now, skip any with empty output.
    real_fictional = [p for p in fictional_pairs if p.get("output")]
    placeholder_fictional = [p for p in fictional_pairs if not p.get("output")]

    if placeholder_fictional:
        print(f" {len(placeholder_fictional)} fictional pairs need generation via template engine")
        print(" (Run folksy_generator.py with --entities to generate these, then re-run this script)")

    all_pairs.extend(real_fictional)

    # Clean up internal fields before writing
    for pair in all_pairs:
        pair.pop("_needs_generation", None)
        pair.pop("_entity", None)

    # Write output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    # Stats
    input_types = Counter(_classify_input(pair["input"]) for pair in all_pairs)

    print(f"\nTotal training pairs: {len(all_pairs)}")
    print("Distribution by input type:")
    for itype, count in sorted(input_types.items()):
        print(f" {itype:20s} {count:5d}")

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue