#!/usr/bin/env python3
"""Folksy Idiom Generator — Procedural fake-proverb generator using ConceptNet relationships."""

import argparse
import csv
import json
import os
import random
import sys
from collections import defaultdict
from pathlib import Path

DATA_DIR = Path(__file__).parent / "data"


# ---------------------------------------------------------------------------
# Graph data structures
# ---------------------------------------------------------------------------

class FolksyGraph:
    """In-memory graph of folksy vocabulary and their ConceptNet relationships."""

    def __init__(self):
        # word -> {"categories": [...], "tangibility": float, "edge_count": int}
        self.vocab = {}
        # category -> [words]
        self.by_category = defaultdict(list)
        # (start, relation) -> [(end, weight, surface_text)]
        self.edges = defaultdict(list)
        # (end, relation) -> [(start, weight, surface_text)]
        self.reverse = defaultdict(list)
        # word -> [(other_word, relation, weight)], undirected adjacency
        self.all_edges = defaultdict(list)
        self.all_words = []

    def load(self, vocab_path=None, relations_path=None):
        """Populate the graph from the vocab and relations CSV files.

        Falls back to the bundled data files under DATA_DIR when no
        explicit paths are given.
        """
        vocab_path = vocab_path or (DATA_DIR / "folksy_vocab.csv")
        relations_path = relations_path or (DATA_DIR / "folksy_relations.csv")

        with open(vocab_path, newline="", encoding="utf-8") as fh:
            for record in csv.DictReader(fh):
                term = record["word"]
                cats = [c.strip() for c in record["categories"].split(",") if c.strip()]
                self.vocab[term] = {
                    "categories": cats,
                    "tangibility": float(record.get("tangibility_score", 0)),
                    "edge_count": int(record.get("conceptnet_edge_count", 0)),
                }
                for cat in cats:
                    self.by_category[cat].append(term)
        self.all_words = list(self.vocab.keys())

        with open(relations_path, newline="", encoding="utf-8") as fh:
            for record in csv.DictReader(fh):
                src = record["start_word"]
                dst = record["end_word"]
                rel = record["relation"]
                weight = float(record["weight"])
                surface = record.get("surface_text", "")
                # Index the edge both directionally and undirected.
                self.edges[(src, rel)].append((dst, weight, surface))
                self.reverse[(dst, rel)].append((src, weight, surface))
                self.all_edges[src].append((dst, rel, weight))
                self.all_edges[dst].append((src, rel, weight))
def merge_fictional(self, entities_path): """Merge fictional entities into the graph.""" with open(entities_path, encoding="utf-8") as f: data = json.load(f) for entity in data.get("entities", []): name = entity["name"].lower() cats = entity.get("categories", []) props = entity.get("properties", []) # Inherit from parents inherited_relations = defaultdict(list) for parent in entity.get("derived_from", []): parent = parent.lower() if parent in self.vocab: parent_cats = self.vocab[parent]["categories"] cats = list(set(cats + parent_cats)) # Gather all edges from parent for (sw, rel), targets in list(self.edges.items()): if sw == parent: for (ew, w, surf) in targets: inherited_relations[rel].append((ew, w, "")) for (ew, rel), sources in list(self.reverse.items()): if ew == parent: for (sw, w, surf) in sources: inherited_relations[rel].append((sw, w, "")) # Register the entity as a vocab word self.vocab[name] = { "categories": cats, "tangibility": 0.5, "edge_count": 0, } for cat in cats: self.by_category[cat].append(name) self.all_words.append(name) # Add inherited relations (lower priority) for rel, targets in inherited_relations.items(): for (target, w, surf) in targets: self.edges[(name, rel)].append((target, w, "")) self.reverse[(target, rel)].append((name, w, "")) self.all_edges[name].append((target, rel, w)) # Add explicit relations (override) for rel, targets in entity.get("relations", {}).items(): for target in targets: target_lower = target.lower() self.edges[(name, rel)].append((target_lower, 2.0, "")) self.reverse[(target_lower, rel)].append((name, 2.0, "")) self.all_edges[name].append((target_lower, rel, 2.0)) # Add properties as HasProperty edges for prop in props: self.edges[(name, "HasProperty")].append((prop.lower(), 2.0, "")) self.all_edges[name].append((prop.lower(), "HasProperty", 2.0)) def neighbors(self, word, relation=None, min_weight=0.0, vocab_only=False): """Get neighbors of a word, optionally filtered by relation type. 
Args: vocab_only: If True, only return neighbors that are in the folksy vocab. If False (default), return all neighbors including action phrases, properties, etc. """ if relation: return [(ew, w, s) for (ew, w, s) in self.edges.get((word, relation), []) if w >= min_weight and (not vocab_only or ew in self.vocab)] results = [] for (ew, rel, w) in self.all_edges.get(word, []): if w >= min_weight and (not vocab_only or ew in self.vocab): results.append((ew, rel, w)) return results def vocab_neighbors(self, word, relation=None, min_weight=0.0): """Get neighbors restricted to folksy vocab words only.""" return self.neighbors(word, relation, min_weight, vocab_only=True) def two_hop(self, word, rel1, rel2, min_weight=0.5): """Find 2-hop paths: word -[rel1]-> bridge -[rel2]-> target. Bridge can be any word; target must be in folksy vocab. """ results = [] for (bridge, w1, _) in self.edges.get((word, rel1), []): for (target, w2, _) in self.edges.get((bridge, rel2), []): if target != word and target in self.vocab and w2 >= min_weight: results.append((bridge, target, w1, w2)) return results def two_hop_any(self, word, rel1, rel2, min_weight=0.5): """Find 2-hop paths where target can be any word (not just vocab).""" results = [] for (bridge, w1, _) in self.edges.get((word, rel1), []): for (target, w2, _) in self.edges.get((bridge, rel2), []): if target != word and w2 >= min_weight: results.append((bridge, target, w1, w2)) return results def random_word(self, category=None): """Pick a random word, optionally from a specific category.""" if category and category in self.by_category: pool = self.by_category[category] else: pool = self.all_words return random.choice(pool) if pool else None # --------------------------------------------------------------------------- # Meta-templates # --------------------------------------------------------------------------- class MetaTemplate: """Base class for meta-template families.""" id = "base" name = "Base Template" surface_templates = [] 
def __init__(self, graph): self.graph = graph def generate(self, seed_word=None, seed_category=None): """Attempt to generate a saying. Returns (saying, debug_info) or (None, None).""" raise NotImplementedError def _pick_template(self): return random.choice(self.surface_templates) def _seed(self, seed_word=None, seed_category=None): if seed_word: return seed_word.lower() return self.graph.random_word(seed_category) class Deconstruction(MetaTemplate): """A without B is just humble D.""" id = "deconstruction" name = "Deconstruction" surface_templates = [ "You know what they say, {A_article} with no {B} is just a {C} {D}.", "Take the {B} out of {A} and all you've got left is {C} {D}.", "{A} without {B}? That's just {D} with ideas above its station.", "{A_Article} ain't nothing but {D} that met some {B}.", ] def generate(self, seed_word=None, seed_category=None): a = self._seed(seed_word, seed_category) if not a: return None, None # Find what A is made of / requires ingredients = [] ingredient_rels = [] # track which relation found each ingredient for rel in ("MadeOf", "HasPrerequisite", "HasA"): found = _short_concepts(self.graph.neighbors(a, rel, min_weight=0.5)) for item in found: ingredients.append(item) ingredient_rels.append(rel) if len(ingredients) < 2: for rel in ("MadeOf", "HasPrerequisite"): for (start, w, s) in self.graph.reverse.get((a, rel), []): if len(start.split("_")) <= 2: ingredients.append((start, w, s)) ingredient_rels.append(rel) if len(ingredients) < 2: return None, None # Shuffle together combined = list(zip(ingredients, ingredient_rels)) random.shuffle(combined) ingredients, ingredient_rels = zip(*combined) b_edge = ingredients[0] b_word = _readable(b_edge[0]) b_rel = ingredient_rels[0] d_edge = ingredients[1] d_word = _readable(d_edge[0]) d_rel = ingredient_rels[1] # Find a property for D chain_edges = [ {"start": a, "relation": b_rel, "end": b_edge[0], "weight": b_edge[1], "surface_text": b_edge[2]}, {"start": a, "relation": d_rel, "end": 
d_edge[0], "weight": d_edge[1], "surface_text": d_edge[2]}, ] props = self.graph.neighbors(ingredients[1][0], "HasProperty") if props: c_prop = random.choice(props) c_word = _readable(c_prop[0]) chain_edges.append({"start": d_edge[0], "relation": "HasProperty", "end": c_prop[0], "weight": c_prop[1], "surface_text": c_prop[2]}) else: c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"]) template = self._pick_template() saying = template.format(A=a, A_article=_a(a), A_Article=_a(a).capitalize(), B=b_word, C=c_word, D=d_word) debug = { "template_family": self.id, "template": template, "chain": f"{a} MadeOf/Has [{b_word}, {d_word}]; {d_word} HasProperty {c_word}", "chain_edges": chain_edges, "slots": {"A": a, "B": b_word, "C": c_word, "D": d_word}, } return saying, debug class DenialOfConsequences(MetaTemplate): """Don't create conditions for B and deny B.""" id = "denial_of_consequences" name = "Denial of Consequences" surface_templates = [ "Don't {C} the {A} and say you ain't got {B}.", "Don't {C} the {A} and act surprised when the {B} show up.", "Man who {C}s {A_article} can't complain about {B}.", "You can't {C} {A_article} and then wonder where all the {B} came from.", ] def generate(self, seed_word=None, seed_category=None): a = self._seed(seed_word, seed_category) if not a: return None, None # What is found at A? (reverse: B AtLocation A) attracted = [] # (word, weight, surface_text, relation) for (b, w, s) in self.graph.reverse.get((a, "AtLocation"), []): attracted.append((b, w, s, "AtLocation")) # Also: what does A attract/cause? 
for rel in ("Causes", "CausesDesire"): for (b, w, s) in self.graph.edges.get((a, rel), []): attracted.append((b, w, s, rel)) if not attracted: for (bridge, target, w1, w2) in self.graph.two_hop(a, "UsedFor", "AtLocation"): attracted.append((target, w1 + w2, "", "AtLocation")) if not attracted: return None, None b_choice = random.choice(attracted) b_word = _readable(b_choice[0]) chain_edges = [ {"start": b_choice[0] if b_choice[3] == "AtLocation" else a, "relation": b_choice[3], "end": a if b_choice[3] == "AtLocation" else b_choice[0], "weight": b_choice[1], "surface_text": b_choice[2]}, ] create_verbs = { "pond": "dig", "birdhouse": "hang", "fence": "build", "trap": "set", "fire": "light", "garden": "plant", "nest": "build", "well": "dig", "bridge": "build", "barn": "raise", "path": "clear", "stable": "raise", "coop": "build", "den": "dig", "ditch": "dig", "furrow": "plow", "orchard": "plant", "hearth": "lay", "chimney": "build", } c_word = create_verbs.get(a) if not c_word: c_word = random.choice(["build", "set up", "put out", "lay down", "make"]) template = self._pick_template() saying = template.format(A=a, A_article=_a(a), B=b_word, C=c_word) debug = { "template_family": self.id, "template": template, "chain": f"{b_word} AtLocation {a}; {a} created by {c_word}", "chain_edges": chain_edges, "slots": {"A": a, "B": b_word, "C": c_word}, } return saying, debug class IronicDeficiency(MetaTemplate): """Producer of X lacks X.""" id = "ironic_deficiency" name = "Ironic Deficiency" surface_templates = [ "The {A}'s {F} always goes without {X}.", "Nobody's got less {X} than the man who makes the {A}.", "Funny how the {A} never has enough {X} for itself.", "The {A} feeds everyone's {X} but its own.", ] def generate(self, seed_word=None, seed_category=None): a = self._seed(seed_word, seed_category) if not a: return None, None products = [] product_rels = [] for rel in ("UsedFor", "CapableOf", "Causes"): found = self.graph.neighbors(a, rel, min_weight=0.5) for item in found: 
products.append(item) product_rels.append(rel) # Filter to short concepts while keeping rel tracking filtered = [(p, r) for p, r in zip(products, product_rels) if len(p[0].split("_")) <= 3] if not filtered: return None, None choice_idx = random.randrange(len(filtered)) x_edge, x_rel = filtered[choice_idx] x_word = _readable(x_edge[0]) family_members = ["wife", "children", "household", "family", "own kind"] f_word = random.choice(family_members) template = self._pick_template() saying = template.format(A=a, X=x_word, F=f_word) chain_edges = [ {"start": a, "relation": x_rel, "end": x_edge[0], "weight": x_edge[1], "surface_text": x_edge[2]}, ] debug = { "template_family": self.id, "template": template, "chain": f"{a} UsedFor/Produces {x_word}; irony: {a} lacks {x_word}", "chain_edges": chain_edges, "slots": {"A": a, "X": x_word, "F": f_word}, } return saying, debug class FutilePreparation(MetaTemplate): """Like doing A and hoping for unrelated Y.""" id = "futile_preparation" name = "Futile Preparation" surface_templates = [ "Like {A_gerund} and hoping for {Y}.", "That's just {A_gerund} and praying for {Y}.", "My grandmother used to say, '{A_gerund} won't bring you {Y}.'", "You can {A_verb} all you want, it still won't get you {Y}.", ] def generate(self, seed_word=None, seed_category=None): # Find an action and a desired outcome that are in the same domain but mismatched seed = self._seed(seed_word, seed_category) if not seed: return None, None # What is the seed used for? Filter out vocab nouns and noun-like words — we need verbs. 
uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2) uses = [u for u in uses if u[0] not in self.graph.vocab and _looks_like_verb(u[0])] if not uses: return None, None action_edge = random.choice(uses) action_word = action_edge[0] chain_edges = [ {"start": seed, "relation": "UsedFor", "end": action_edge[0], "weight": action_edge[1], "surface_text": action_edge[2]}, ] # Find a different outcome in a related domain via 2-hop outcomes = [] for rel in ("Causes", "UsedFor", "HasSubevent"): hops = self.graph.two_hop_any(seed, "AtLocation", rel) outcomes.extend([(_readable(t), w1 + w2) for (_, t, w1, w2) in hops]) # Also try: things that siblings are UsedFor seed_cats = self.graph.vocab.get(seed, {}).get("categories", []) for cat in seed_cats: siblings = self.graph.by_category.get(cat, []) for sib in random.sample(siblings, min(5, len(siblings))): if sib != seed: for (target, w, s) in self.graph.edges.get((sib, "UsedFor"), []): if target != action_word: outcomes.append((_readable(target), w)) if not outcomes: return None, None y_choice = random.choice(outcomes) y_word = y_choice[0] gerund = _gerund(action_word) verb = _readable(action_word) template = self._pick_template() saying = template.format(A_gerund=f"{gerund} the {seed}", Y=y_word, A_verb=f"{verb} the {seed}") debug = { "template_family": self.id, "template": template, "chain": f"{seed} UsedFor {action_word}; different domain: {y_word}", "chain_edges": chain_edges, "slots": {"seed": seed, "action": action_word, "Y": y_word}, } return saying, debug class HypocriticalComplaint(MetaTemplate): """Consumes X from system Z, complains about remaining Y.""" id = "hypocritical_complaint" name = "Hypocritical Complaint" surface_templates = [ "There's a fella who {verb}s the {X} and says the {Y}'s no good.", "That's like eating the {X} and complaining the {Y} tastes off.", "He picks all the {X} then wonders why the {Y} looks bare.", "Don't {verb} the {X} and then gripe about the {Y}.", ] 
def generate(self, seed_word=None, seed_category=None): # Z is the whole, X and Y are parts z = self._seed(seed_word, seed_category) if not z: return None, None # Find parts of Z parts = [] part_rels = [] for rel in ("HasA", "PartOf", "MadeOf"): found = _short_concepts(self.graph.neighbors(z, rel, min_weight=0.5)) for item in found: parts.append(item) part_rels.append(rel) for (start, w, s) in self.graph.reverse.get((z, "PartOf"), []): if len(start.split("_")) <= 2: parts.append((start, w, s)) part_rels.append("PartOf") for (start, w, s) in self.graph.reverse.get((z, "HasA"), []): if len(start.split("_")) <= 2: parts.append((start, w, s)) part_rels.append("HasA") if len(parts) < 2: return None, None combined = list(zip(parts, part_rels)) random.shuffle(combined) parts, part_rels = zip(*combined) x_edge = parts[0] x_word = _readable(x_edge[0]) y_edge = parts[1] y_word = _readable(y_edge[0]) chain_edges = [ {"start": z, "relation": part_rels[0], "end": x_edge[0], "weight": x_edge[1], "surface_text": x_edge[2]}, {"start": z, "relation": part_rels[1], "end": y_edge[0], "weight": y_edge[1], "surface_text": y_edge[2]}, ] consume_verbs = ["eat", "drink", "take", "pick", "use up", "grab"] verb = random.choice(consume_verbs) template = self._pick_template() saying = template.format(X=x_word, Y=y_word, verb=verb) debug = { "template_family": self.id, "template": template, "chain": f"{x_word} PartOf/HasA {z}; {y_word} PartOf/HasA {z}", "chain_edges": chain_edges, "slots": {"Z": z, "X": x_word, "Y": y_word, "verb": verb}, } return saying, debug class TautologicalWisdom(MetaTemplate): """States obvious causal/prerequisite as wisdom.""" id = "tautological_wisdom" name = "Tautological Wisdom" surface_templates = [ "You know what they say, it takes {X_article} to get {Y_article}.", "My daddy always said, can't have {Y} without {X}.", "{Y_Article} don't come without its {X}, now does it?", "You want {Y}? 
Well, first you're gonna need {X}.", "Ain't no {Y} ever came from nothing — you need {X}.", ] def generate(self, seed_word=None, seed_category=None): seed = self._seed(seed_word, seed_category) if not seed: return None, None # seed HasPrerequisite/Causes something # Store (x_word, y_word, weight, edge_info) where edge_info captures the raw edge chains = [] for (target, w, s) in self.graph.edges.get((seed, "HasPrerequisite"), []): chains.append((_readable(target), seed, w, {"start": seed, "relation": "HasPrerequisite", "end": target, "weight": w, "surface_text": s})) for (target, w, s) in self.graph.edges.get((seed, "Causes"), []): chains.append((seed, _readable(target), w, {"start": seed, "relation": "Causes", "end": target, "weight": w, "surface_text": s})) # Also: what does seed require? for (source, w, s) in self.graph.reverse.get((seed, "HasPrerequisite"), []): chains.append((seed, _readable(source), w, {"start": source, "relation": "HasPrerequisite", "end": seed, "weight": w, "surface_text": s})) if not chains: return None, None choice = random.choice(chains) x_word, y_word = choice[0], choice[1] chain_edge = choice[3] template = self._pick_template() saying = template.format(X=x_word, Y=y_word, X_article=_a(x_word), Y_article=_a(y_word), Y_Article=_a(y_word).capitalize()) debug = { "template_family": self.id, "template": template, "chain": f"{x_word} -> {y_word} (prerequisite/cause)", "chain_edges": [chain_edge], "slots": {"X": x_word, "Y": y_word}, } return saying, debug class FalseEquivalence(MetaTemplate): """A is just B with/without property P.""" id = "false_equivalence" name = "False Equivalence" surface_templates = [ "{A_article} is just {B_article} that's got {P}.", "What's {A_article} but {B_article} with {P}?", "The only difference between {A_article} and {B_article} is {P}.", "Take the {P} from {A_article} and you've got yourself {B_article}.", ] def generate(self, seed_word=None, seed_category=None): a = self._seed(seed_word, seed_category) if not 
a: return None, None a_cats = set(self.graph.vocab.get(a, {}).get("categories", [])) if not a_cats: return None, None # Find siblings (same category, different word) siblings = [] for cat in a_cats: for sib in self.graph.by_category.get(cat, []): if sib != a: siblings.append(sib) if not siblings: return None, None b_word = random.choice(siblings) # Find a property of A that B might lack a_props = _short_concepts(self.graph.neighbors(a, "HasProperty"), max_words=2) b_props = set(p[0] for p in self.graph.neighbors(b_word, "HasProperty")) chain_edges = [] differentiators = [p for p in a_props if p[0] not in b_props] if differentiators: p_edge = random.choice(differentiators) p_word = _readable(p_edge[0]) chain_edges.append({"start": a, "relation": "HasProperty", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]}) elif a_props: p_edge = random.choice(a_props) p_word = _readable(p_edge[0]) chain_edges.append({"start": a, "relation": "HasProperty", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]}) else: a_caps = self.graph.neighbors(a, "CapableOf") if a_caps: p_edge = random.choice(a_caps) p_word = _readable(p_edge[0]) chain_edges.append({"start": a, "relation": "CapableOf", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]}) else: p_word = random.choice(["ambition", "an attitude", "a plan", "patience"]) template = self._pick_template() saying = template.format(A=a, B=b_word, P=p_word, A_article=_a(a), B_article=_a(b_word)) debug = { "template_family": self.id, "template": template, "chain": f"{a} IsA same category as {b_word}; {a} HasProperty {p_word}", "chain_edges": chain_edges, "slots": {"A": a, "B": b_word, "P": p_word}, } return saying, debug # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _readable(concept): """Convert ConceptNet concept to readable form: 'feed_chicken' -> 'feed chicken'.""" return 
concept.replace("_", " ") def _short_concepts(items, max_words=2): """Filter concept tuples to only those with short readable names. Items can be tuples where first element is the concept string. Returns items where the concept has at most max_words words. """ return [item for item in items if len(item[0].split("_")) <= max_words] def _gerund(word): """Rough gerund form of a verb/action word.""" word = word.split("_")[0] if "_" in word else word # take first word for compounds if word.endswith("e") and not word.endswith("ee"): return word[:-1] + "ing" if word.endswith("ing"): return word # CVC doubling: "run" -> "running", "sit" -> "sitting" # Treat 'y' and 'w' as vowels at word end (prey->preying, not preyying) if len(word) > 2 and word[-1] not in "aeiouy" and word[-2] in "aeiou" and word[-3] not in "aeiou": return word + word[-1] + "ing" return word + "ing" def _looks_like_verb(word): """Heuristic: does this word look like it could be a verb? Rejects words with obvious noun/adjective suffixes.""" w = word.split("_")[0].lower() if "_" in word else word.lower() noun_suffixes = ("tion", "sion", "ment", "ness", "ity", "ance", "ence", "ture", "ism", "ist", "ery", "ory", "ling") return not any(w.endswith(s) for s in noun_suffixes) def _a(word): """Add 'a' or 'an' article.""" if not word: return "a" first = word.split()[0].lower() if word else "" # Words that start with a vowel letter but consonant sound consonant_sound = ("uni", "use", "used", "user", "usual", "usu", "uti", "uto", "uro", "uku", "ula") # Words that start with a consonant letter but vowel sound vowel_sound = ("hour", "honest", "honor", "honour", "heir", "herb") if any(first.startswith(p) for p in vowel_sound): return f"an {word}" if any(first.startswith(p) for p in consonant_sound): return f"a {word}" if first[0] in "aeiou": return f"an {word}" return f"a {word}" TEMPLATE_REGISTRY = { "deconstruction": Deconstruction, "denial_of_consequences": DenialOfConsequences, "ironic_deficiency": IronicDeficiency, 
"futile_preparation": FutilePreparation, "hypocritical_complaint": HypocriticalComplaint, "tautological_wisdom": TautologicalWisdom, "false_equivalence": FalseEquivalence, } # --------------------------------------------------------------------------- # Main generation logic # --------------------------------------------------------------------------- def generate_one(graph, template_id=None, seed_word=None, seed_category=None, debug=False, max_retries=20): """Generate a single folksy saying. When debug=True, always returns (saying, debug_dict) with chain_edges included. """ for _ in range(max_retries): if template_id: tid = template_id else: tid = random.choice(list(TEMPLATE_REGISTRY.keys())) cls = TEMPLATE_REGISTRY.get(tid) if not cls: print(f"Unknown template: {tid}", file=sys.stderr) return None, None tmpl = cls(graph) saying, dbg = tmpl.generate(seed_word=seed_word, seed_category=seed_category) if saying: if debug: return saying, dbg return saying, None return None, None def _get_seed_word(dbg): """Extract the primary seed word from debug slots for dedup tracking.""" slots = dbg.get("slots", {}) # Templates use different slot names for the seed for key in ("A", "Z", "seed", "X"): if key in slots: return slots[key] return None def main(): parser = argparse.ArgumentParser( description="Generate folksy fake-proverbs using ConceptNet relationships." 
) parser.add_argument("--template", "-t", choices=list(TEMPLATE_REGISTRY.keys()), help="Specify a meta-template family") parser.add_argument("--seed", "-s", help="Seed with a specific word") parser.add_argument("--category", "-c", help="Seed with a category (e.g., animal, tool)") parser.add_argument("--entities", "-e", help="Path to fictional entities JSON file") parser.add_argument("--count", "-n", type=int, default=1, help="Number of sayings to generate") parser.add_argument("--output", "-o", help="Output file (default: stdout)") parser.add_argument("--debug", "-d", action="store_true", help="Show relationship chain debug info") parser.add_argument("--json", action="store_true", help="Output JSONL format with full metadata") parser.add_argument("--vocab", help="Path to folksy_vocab.csv") parser.add_argument("--relations", help="Path to folksy_relations.csv") parser.add_argument("--pure-conceptnet", action="store_true", help="Skip loading augmented relations file") parser.add_argument("--llm-weight-boost", type=float, default=0.0, help="Boost weight of LLM-augmented edges with weight < 1.0 (default: 0.0)") parser.add_argument("--list-templates", action="store_true", help="List available templates") parser.add_argument("--list-categories", action="store_true", help="List available categories") args = parser.parse_args() if args.list_templates: for tid, cls in TEMPLATE_REGISTRY.items(): print(f" {tid:30s} {cls.name}") return # Load graph graph = FolksyGraph() try: graph.load( vocab_path=args.vocab or (DATA_DIR / "folksy_vocab.csv"), relations_path=args.relations or (DATA_DIR / "folksy_relations.csv"), ) except FileNotFoundError as e: print(f"Error: {e}", file=sys.stderr) print("Run scripts/extract_from_conceptnet.py first to generate data files.", file=sys.stderr) sys.exit(1) # Load augmented relations if available if not args.pure_conceptnet: augmented_path = DATA_DIR / "folksy_relations_augmented.csv" if augmented_path.exists(): boost = args.llm_weight_boost with 
open(augmented_path, newline="", encoding="utf-8") as f: reader = csv.DictReader(f) count = 0 for row in reader: sw = row["start_word"] ew = row["end_word"] rel = row["relation"] w = float(row["weight"]) if w < 1.0 and boost: w = min(w + boost, 1.0) surf = row.get("surface_text", "") graph.edges[(sw, rel)].append((ew, w, surf)) graph.reverse[(ew, rel)].append((sw, w, surf)) graph.all_edges[sw].append((ew, rel, w)) graph.all_edges[ew].append((sw, rel, w)) count += 1 if count: print(f"Loaded {count} augmented edges.", file=sys.stderr) if args.list_categories: for cat in sorted(graph.by_category.keys()): print(f" {cat:20s} ({len(graph.by_category[cat])} words)") return # Merge fictional entities if args.entities: graph.merge_fictional(args.entities) # JSON mode implies debug internally use_debug = args.debug or args.json # Generate out = open(args.output, "w", encoding="utf-8") if args.output else sys.stdout try: if args.count > 1: # Deduplication tracking for batch mode seen_text = set() seen_slots = set() seed_usage = defaultdict(int) generated = 0 max_outer_attempts = args.count * 10 # generous outer limit attempts = 0 while generated < args.count and attempts < max_outer_attempts: attempts += 1 saying, dbg = generate_one( graph, template_id=args.template, seed_word=args.seed, seed_category=args.category, debug=use_debug, ) if not saying: continue # Dedup checks (failures don't count against retry limit) if saying in seen_text: continue if dbg: slots_key = (dbg["template_family"], frozenset(dbg["slots"].items())) if slots_key in seen_slots: continue seed_w = _get_seed_word(dbg) if seed_w and seed_usage[seed_w] >= 30: continue if seed_w: seed_usage[seed_w] += 1 seen_slots.add(slots_key) seen_text.add(saying) generated += 1 if args.json and dbg: record = { "raw_text": saying, "meta_template": dbg["template_family"], "surface_template": dbg["template"], "slots": dbg["slots"], "chain": dbg.get("chain_edges", []), } out.write(json.dumps(record, ensure_ascii=False) + 
"\n") else: out.write(saying + "\n") if args.debug and dbg: out.write(f" [DEBUG] family={dbg['template_family']}\n") out.write(f" [DEBUG] chain: {dbg['chain']}\n") out.write(f" [DEBUG] slots: {dbg['slots']}\n") out.write("\n") else: # Single generation (no dedup needed) saying, dbg = generate_one( graph, template_id=args.template, seed_word=args.seed, seed_category=args.category, debug=use_debug, ) if saying: if args.json and dbg: record = { "raw_text": saying, "meta_template": dbg["template_family"], "surface_template": dbg["template"], "slots": dbg["slots"], "chain": dbg.get("chain_edges", []), } out.write(json.dumps(record, ensure_ascii=False) + "\n") else: out.write(saying + "\n") if args.debug and dbg: out.write(f" [DEBUG] family={dbg['template_family']}\n") out.write(f" [DEBUG] chain: {dbg['chain']}\n") out.write(f" [DEBUG] slots: {dbg['slots']}\n") out.write("\n") else: out.write("(failed to generate saying after retries)\n") finally: if args.output: out.close() if __name__ == "__main__": main()