folksy_idioms/folksy_generator.py

#!/usr/bin/env python3
"""Folksy Idiom Generator — Procedural fake-proverb generator using ConceptNet relationships."""

import argparse
import csv
import json
import os
import random
import sys
from collections import defaultdict
from pathlib import Path

DATA_DIR = Path(__file__).parent / "data"

# ---------------------------------------------------------------------------
# Graph data structures
# ---------------------------------------------------------------------------

class FolksyGraph:
    """In-memory graph of folksy vocabulary and their ConceptNet relationships."""

    def __init__(self):
        self.vocab = {}           # word -> {categories, tangibility, edge_count}
        self.by_category = defaultdict(list)  # category -> [words]
        self.edges = defaultdict(list)        # (start, relation) -> [(end, weight, surface)]
        self.reverse = defaultdict(list)      # (end, relation) -> [(start, weight, surface)]
        self.all_edges = defaultdict(list)    # start -> [(end, relation, weight)]
        self.all_words = []

    def load(self, vocab_path=None, relations_path=None):
        vocab_path = vocab_path or (DATA_DIR / "folksy_vocab.csv")
        relations_path = relations_path or (DATA_DIR / "folksy_relations.csv")

        with open(vocab_path, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                word = row["word"]
                cats = [c.strip() for c in row["categories"].split(",") if c.strip()]
                self.vocab[word] = {
                    "categories": cats,
                    "tangibility": float(row.get("tangibility_score", 0)),
                    "edge_count": int(row.get("conceptnet_edge_count", 0)),
                }
                for cat in cats:
                    self.by_category[cat].append(word)
        self.all_words = list(self.vocab.keys())

        with open(relations_path, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                sw = row["start_word"]
                ew = row["end_word"]
                rel = row["relation"]
                w = float(row["weight"])
                surf = row.get("surface_text", "")
                self.edges[(sw, rel)].append((ew, w, surf))
                self.reverse[(ew, rel)].append((sw, w, surf))
                self.all_edges[sw].append((ew, rel, w))
                self.all_edges[ew].append((sw, rel, w))

    def merge_fictional(self, entities_path):
        """Merge fictional entities into the graph."""
        with open(entities_path, encoding="utf-8") as f:
            data = json.load(f)

        for entity in data.get("entities", []):
            name = entity["name"].lower()
            cats = entity.get("categories", [])
            props = entity.get("properties", [])

            # Inherit from parents
            inherited_relations = defaultdict(list)
            for parent in entity.get("derived_from", []):
                parent = parent.lower()
                if parent in self.vocab:
                    parent_cats = self.vocab[parent]["categories"]
                    cats = list(set(cats + parent_cats))
                # Gather all edges from parent
                for (sw, rel), targets in list(self.edges.items()):
                    if sw == parent:
                        for (ew, w, surf) in targets:
                            inherited_relations[rel].append((ew, w, ""))
                for (ew, rel), sources in list(self.reverse.items()):
                    if ew == parent:
                        for (sw, w, surf) in sources:
                            inherited_relations[rel].append((sw, w, ""))

            # Register the entity as a vocab word
            self.vocab[name] = {
                "categories": cats,
                "tangibility": 0.5,
                "edge_count": 0,
            }
            for cat in cats:
                self.by_category[cat].append(name)
            self.all_words.append(name)

            # Add inherited relations (lower priority)
            for rel, targets in inherited_relations.items():
                for (target, w, surf) in targets:
                    self.edges[(name, rel)].append((target, w, ""))
                    self.reverse[(target, rel)].append((name, w, ""))
                    self.all_edges[name].append((target, rel, w))

            # Add explicit relations (override)
            for rel, targets in entity.get("relations", {}).items():
                for target in targets:
                    target_lower = target.lower()
                    self.edges[(name, rel)].append((target_lower, 2.0, ""))
                    self.reverse[(target_lower, rel)].append((name, 2.0, ""))
                    self.all_edges[name].append((target_lower, rel, 2.0))

            # Add properties as HasProperty edges
            for prop in props:
                self.edges[(name, "HasProperty")].append((prop.lower(), 2.0, ""))
                self.all_edges[name].append((prop.lower(), "HasProperty", 2.0))

    def neighbors(self, word, relation=None, min_weight=0.0, vocab_only=False):
        """Get neighbors of a word, optionally filtered by relation type.

        Args:
            vocab_only: If True, only return neighbors that are in the folksy vocab.
                        If False (default), return all neighbors including action
                        phrases, properties, etc.
        """
        if relation:
            return [(ew, w, s) for (ew, w, s) in self.edges.get((word, relation), [])
                    if w >= min_weight and (not vocab_only or ew in self.vocab)]
        results = []
        for (ew, rel, w) in self.all_edges.get(word, []):
            if w >= min_weight and (not vocab_only or ew in self.vocab):
                results.append((ew, rel, w))
        return results

    def vocab_neighbors(self, word, relation=None, min_weight=0.0):
        """Get neighbors restricted to folksy vocab words only."""
        return self.neighbors(word, relation, min_weight, vocab_only=True)

    def two_hop(self, word, rel1, rel2, min_weight=0.5):
        """Find 2-hop paths: word -[rel1]-> bridge -[rel2]-> target.

        Bridge can be any word; target must be in folksy vocab.
        """
        results = []
        for (bridge, w1, _) in self.edges.get((word, rel1), []):
            for (target, w2, _) in self.edges.get((bridge, rel2), []):
                if target != word and target in self.vocab and w2 >= min_weight:
                    results.append((bridge, target, w1, w2))
        return results

    def two_hop_any(self, word, rel1, rel2, min_weight=0.5):
        """Find 2-hop paths where target can be any word (not just vocab)."""
        results = []
        for (bridge, w1, _) in self.edges.get((word, rel1), []):
            for (target, w2, _) in self.edges.get((bridge, rel2), []):
                if target != word and w2 >= min_weight:
                    results.append((bridge, target, w1, w2))
        return results

    def random_word(self, category=None):
        """Pick a random word, optionally from a specific category."""
        if category and category in self.by_category:
            pool = self.by_category[category]
        else:
            pool = self.all_words
        return random.choice(pool) if pool else None


# ---------------------------------------------------------------------------
# Meta-templates
# ---------------------------------------------------------------------------

class MetaTemplate:
    """Base class for meta-template families."""

    id = "base"
    name = "Base Template"
    surface_templates = []

    def __init__(self, graph):
        self.graph = graph

    def generate(self, seed_word=None, seed_category=None):
        """Attempt to generate a saying. Returns (saying, debug_info) or (None, None)."""
        raise NotImplementedError

    def _pick_template(self):
        return random.choice(self.surface_templates)

    def _seed(self, seed_word=None, seed_category=None):
        if seed_word:
            return seed_word.lower()
        return self.graph.random_word(seed_category)


class Deconstruction(MetaTemplate):
    """A without B is just humble D."""

    id = "deconstruction"
    name = "Deconstruction"
    surface_templates = [
        "You know what they say, {A_article} with no {B} is just a {C} {D}.",
        "Take the {B} out of {A} and all you've got left is {C} {D}.",
        "{A} without {B}? That's just {D} with ideas above its station.",
        "{A_Article} ain't nothing but {D} that met some {B}.",
    ]

    def generate(self, seed_word=None, seed_category=None):
        a = self._seed(seed_word, seed_category)
        if not a:
            return None, None

        # Find what A is made of / requires
        ingredients = []
        ingredient_rels = []  # track which relation found each ingredient
        for rel in ("MadeOf", "HasPrerequisite", "HasA"):
            found = _short_concepts(self.graph.neighbors(a, rel, min_weight=0.5))
            for item in found:
                ingredients.append(item)
                ingredient_rels.append(rel)

        if len(ingredients) < 2:
            for rel in ("MadeOf", "HasPrerequisite"):
                for (start, w, s) in self.graph.reverse.get((a, rel), []):
                    if len(start.split("_")) <= 2:
                        ingredients.append((start, w, s))
                        ingredient_rels.append(rel)

        if len(ingredients) < 2:
            return None, None

        # Shuffle together
        combined = list(zip(ingredients, ingredient_rels))
        random.shuffle(combined)
        ingredients, ingredient_rels = zip(*combined)

        b_edge = ingredients[0]
        b_word = _readable(b_edge[0])
        b_rel = ingredient_rels[0]
        d_edge = ingredients[1]
        d_word = _readable(d_edge[0])
        d_rel = ingredient_rels[1]

        # Find a property for D
        chain_edges = [
            {"start": a, "relation": b_rel, "end": b_edge[0], "weight": b_edge[1], "surface_text": b_edge[2]},
            {"start": a, "relation": d_rel, "end": d_edge[0], "weight": d_edge[1], "surface_text": d_edge[2]},
        ]
        props = self.graph.neighbors(ingredients[1][0], "HasProperty")
        if props:
            c_prop = random.choice(props)
            c_word = _readable(c_prop[0])
            chain_edges.append({"start": d_edge[0], "relation": "HasProperty", "end": c_prop[0], "weight": c_prop[1], "surface_text": c_prop[2]})
        else:
            c_word = random.choice(["plain", "sorry", "old", "humble", "dry", "wet", "cold"])

        template = self._pick_template()
        saying = template.format(A=a, A_article=_a(a), A_Article=_a(a).capitalize(),
                                  B=b_word, C=c_word, D=d_word)

        debug = {
            "template_family": self.id,
            "template": template,
            "chain": f"{a} MadeOf/Has [{b_word}, {d_word}]; {d_word} HasProperty {c_word}",
            "chain_edges": chain_edges,
            "slots": {"A": a, "B": b_word, "C": c_word, "D": d_word},
        }
        return saying, debug


class DenialOfConsequences(MetaTemplate):
    """Don't create conditions for B and deny B."""

    id = "denial_of_consequences"
    name = "Denial of Consequences"
    surface_templates = [
        "Don't {C} the {A} and say you ain't got {B}.",
        "Don't {C} the {A} and act surprised when the {B} show up.",
        "Man who {C}s {A_article} can't complain about {B}.",
        "You can't {C} {A_article} and then wonder where all the {B} came from.",
    ]

    def generate(self, seed_word=None, seed_category=None):
        a = self._seed(seed_word, seed_category)
        if not a:
            return None, None

        # What is found at A? (reverse: B AtLocation A)
        attracted = []  # (word, weight, surface_text, relation)
        for (b, w, s) in self.graph.reverse.get((a, "AtLocation"), []):
            attracted.append((b, w, s, "AtLocation"))

        # Also: what does A attract/cause?
        for rel in ("Causes", "CausesDesire"):
            for (b, w, s) in self.graph.edges.get((a, rel), []):
                attracted.append((b, w, s, rel))

        if not attracted:
            for (bridge, target, w1, w2) in self.graph.two_hop(a, "UsedFor", "AtLocation"):
                attracted.append((target, w1 + w2, "", "AtLocation"))

        if not attracted:
            return None, None

        b_choice = random.choice(attracted)
        b_word = _readable(b_choice[0])

        chain_edges = [
            {"start": b_choice[0] if b_choice[3] == "AtLocation" else a,
             "relation": b_choice[3],
             "end": a if b_choice[3] == "AtLocation" else b_choice[0],
             "weight": b_choice[1], "surface_text": b_choice[2]},
        ]

        create_verbs = {
            "pond": "dig", "birdhouse": "hang", "fence": "build", "trap": "set",
            "fire": "light", "garden": "plant", "nest": "build", "well": "dig",
            "bridge": "build", "barn": "raise", "path": "clear", "stable": "raise",
            "coop": "build", "den": "dig", "ditch": "dig", "furrow": "plow",
            "orchard": "plant", "hearth": "lay", "chimney": "build",
        }
        c_word = create_verbs.get(a)
        if not c_word:
            c_word = random.choice(["build", "set up", "put out", "lay down", "make"])

        template = self._pick_template()
        saying = template.format(A=a, A_article=_a(a), B=b_word, C=c_word)

        debug = {
            "template_family": self.id,
            "template": template,
            "chain": f"{b_word} AtLocation {a}; {a} created by {c_word}",
            "chain_edges": chain_edges,
            "slots": {"A": a, "B": b_word, "C": c_word},
        }
        return saying, debug


class IronicDeficiency(MetaTemplate):
    """Producer of X lacks X."""

    id = "ironic_deficiency"
    name = "Ironic Deficiency"
    surface_templates = [
        "The {A}'s {F} always goes without {X}.",
        "Nobody's got less {X} than the man who makes the {A}.",
        "Funny how the {A} never has enough {X} for itself.",
        "The {A} feeds everyone's {X} but its own.",
    ]

    def generate(self, seed_word=None, seed_category=None):
        a = self._seed(seed_word, seed_category)
        if not a:
            return None, None

        products = []
        product_rels = []
        for rel in ("UsedFor", "CapableOf", "Causes"):
            found = self.graph.neighbors(a, rel, min_weight=0.5)
            for item in found:
                products.append(item)
                product_rels.append(rel)

        # Filter to short concepts while keeping rel tracking
        filtered = [(p, r) for p, r in zip(products, product_rels) if len(p[0].split("_")) <= 3]
        if not filtered:
            return None, None

        choice_idx = random.randrange(len(filtered))
        x_edge, x_rel = filtered[choice_idx]
        x_word = _readable(x_edge[0])

        family_members = ["wife", "children", "household", "family", "own kind"]
        f_word = random.choice(family_members)

        template = self._pick_template()
        saying = template.format(A=a, X=x_word, F=f_word)

        chain_edges = [
            {"start": a, "relation": x_rel, "end": x_edge[0], "weight": x_edge[1], "surface_text": x_edge[2]},
        ]

        debug = {
            "template_family": self.id,
            "template": template,
            "chain": f"{a} UsedFor/Produces {x_word}; irony: {a} lacks {x_word}",
            "chain_edges": chain_edges,
            "slots": {"A": a, "X": x_word, "F": f_word},
        }
        return saying, debug


class FutilePreparation(MetaTemplate):
    """Like doing A and hoping for unrelated Y."""

    id = "futile_preparation"
    name = "Futile Preparation"
    surface_templates = [
        "Like {A_gerund} and hoping for {Y}.",
        "That's just {A_gerund} and praying for {Y}.",
        "My grandmother used to say, '{A_gerund} won't bring you {Y}.'",
        "You can {A_verb} all you want, it still won't get you {Y}.",
    ]

    def generate(self, seed_word=None, seed_category=None):
        # Find an action and a desired outcome that are in the same domain but mismatched
        seed = self._seed(seed_word, seed_category)
        if not seed:
            return None, None

        # What is the seed used for? Filter out vocab nouns and noun-like words — we need verbs.
        uses = _short_concepts(self.graph.neighbors(seed, "UsedFor", min_weight=0.5), max_words=2)
        uses = [u for u in uses if u[0] not in self.graph.vocab and _looks_like_verb(u[0])]
        if not uses:
            return None, None

        action_edge = random.choice(uses)
        action_word = action_edge[0]

        chain_edges = [
            {"start": seed, "relation": "UsedFor", "end": action_edge[0], "weight": action_edge[1], "surface_text": action_edge[2]},
        ]

        # Find a different outcome in a related domain via 2-hop
        outcomes = []
        for rel in ("Causes", "UsedFor", "HasSubevent"):
            hops = self.graph.two_hop_any(seed, "AtLocation", rel)
            outcomes.extend([(_readable(t), w1 + w2) for (_, t, w1, w2) in hops])

        # Also try: things that siblings are UsedFor
        seed_cats = self.graph.vocab.get(seed, {}).get("categories", [])
        for cat in seed_cats:
            siblings = self.graph.by_category.get(cat, [])
            for sib in random.sample(siblings, min(5, len(siblings))):
                if sib != seed:
                    for (target, w, s) in self.graph.edges.get((sib, "UsedFor"), []):
                        if target != action_word:
                            outcomes.append((_readable(target), w))

        if not outcomes:
            return None, None

        y_choice = random.choice(outcomes)
        y_word = y_choice[0]

        gerund = _gerund(action_word)
        verb = _readable(action_word)

        template = self._pick_template()
        saying = template.format(A_gerund=f"{gerund} the {seed}", Y=y_word,
                                 A_verb=f"{verb} the {seed}")

        debug = {
            "template_family": self.id,
            "template": template,
            "chain": f"{seed} UsedFor {action_word}; different domain: {y_word}",
            "chain_edges": chain_edges,
            "slots": {"seed": seed, "action": action_word, "Y": y_word},
        }
        return saying, debug


class HypocriticalComplaint(MetaTemplate):
    """Consumes X from system Z, complains about remaining Y."""

    id = "hypocritical_complaint"
    name = "Hypocritical Complaint"
    surface_templates = [
        "There's a fella who {verb}s the {X} and says the {Y}'s no good.",
        "That's like eating the {X} and complaining the {Y} tastes off.",
        "He picks all the {X} then wonders why the {Y} looks bare.",
        "Don't {verb} the {X} and then gripe about the {Y}.",
    ]

    def generate(self, seed_word=None, seed_category=None):
        # Z is the whole, X and Y are parts
        z = self._seed(seed_word, seed_category)
        if not z:
            return None, None

        # Find parts of Z
        parts = []
        part_rels = []
        for rel in ("HasA", "PartOf", "MadeOf"):
            found = _short_concepts(self.graph.neighbors(z, rel, min_weight=0.5))
            for item in found:
                parts.append(item)
                part_rels.append(rel)
            for (start, w, s) in self.graph.reverse.get((z, "PartOf"), []):
                if len(start.split("_")) <= 2:
                    parts.append((start, w, s))
                    part_rels.append("PartOf")
            for (start, w, s) in self.graph.reverse.get((z, "HasA"), []):
                if len(start.split("_")) <= 2:
                    parts.append((start, w, s))
                    part_rels.append("HasA")

        if len(parts) < 2:
            return None, None

        combined = list(zip(parts, part_rels))
        random.shuffle(combined)
        parts, part_rels = zip(*combined)

        x_edge = parts[0]
        x_word = _readable(x_edge[0])
        y_edge = parts[1]
        y_word = _readable(y_edge[0])

        chain_edges = [
            {"start": z, "relation": part_rels[0], "end": x_edge[0], "weight": x_edge[1], "surface_text": x_edge[2]},
            {"start": z, "relation": part_rels[1], "end": y_edge[0], "weight": y_edge[1], "surface_text": y_edge[2]},
        ]

        consume_verbs = ["eat", "drink", "take", "pick", "use up", "grab"]
        verb = random.choice(consume_verbs)

        template = self._pick_template()
        saying = template.format(X=x_word, Y=y_word, verb=verb)

        debug = {
            "template_family": self.id,
            "template": template,
            "chain": f"{x_word} PartOf/HasA {z}; {y_word} PartOf/HasA {z}",
            "chain_edges": chain_edges,
            "slots": {"Z": z, "X": x_word, "Y": y_word, "verb": verb},
        }
        return saying, debug


class TautologicalWisdom(MetaTemplate):
    """States obvious causal/prerequisite as wisdom."""

    id = "tautological_wisdom"
    name = "Tautological Wisdom"
    surface_templates = [
        "You know what they say, it takes {X_article} to get {Y_article}.",
        "My daddy always said, can't have {Y} without {X}.",
        "{Y_Article} don't come without its {X}, now does it?",
        "You want {Y}? Well, first you're gonna need {X}.",
        "Ain't no {Y} ever came from nothing — you need {X}.",
    ]

    def generate(self, seed_word=None, seed_category=None):
        seed = self._seed(seed_word, seed_category)
        if not seed:
            return None, None

        # seed HasPrerequisite/Causes something
        # Store (x_word, y_word, weight, edge_info) where edge_info captures the raw edge
        chains = []
        for (target, w, s) in self.graph.edges.get((seed, "HasPrerequisite"), []):
            chains.append((_readable(target), seed, w,
                           {"start": seed, "relation": "HasPrerequisite", "end": target, "weight": w, "surface_text": s}))
        for (target, w, s) in self.graph.edges.get((seed, "Causes"), []):
            chains.append((seed, _readable(target), w,
                           {"start": seed, "relation": "Causes", "end": target, "weight": w, "surface_text": s}))
        # Also: what does seed require?
        for (source, w, s) in self.graph.reverse.get((seed, "HasPrerequisite"), []):
            chains.append((seed, _readable(source), w,
                           {"start": source, "relation": "HasPrerequisite", "end": seed, "weight": w, "surface_text": s}))

        if not chains:
            return None, None

        choice = random.choice(chains)
        x_word, y_word = choice[0], choice[1]
        chain_edge = choice[3]

        template = self._pick_template()
        saying = template.format(X=x_word, Y=y_word,
                                  X_article=_a(x_word), Y_article=_a(y_word),
                                  Y_Article=_a(y_word).capitalize())

        debug = {
            "template_family": self.id,
            "template": template,
            "chain": f"{x_word} -> {y_word} (prerequisite/cause)",
            "chain_edges": [chain_edge],
            "slots": {"X": x_word, "Y": y_word},
        }
        return saying, debug


class FalseEquivalence(MetaTemplate):
    """A is just B with/without property P."""

    id = "false_equivalence"
    name = "False Equivalence"
    surface_templates = [
        "{A_article} is just {B_article} that's got {P}.",
        "What's {A_article} but {B_article} with {P}?",
        "The only difference between {A_article} and {B_article} is {P}.",
        "Take the {P} from {A_article} and you've got yourself {B_article}.",
    ]

    def generate(self, seed_word=None, seed_category=None):
        a = self._seed(seed_word, seed_category)
        if not a:
            return None, None

        a_cats = set(self.graph.vocab.get(a, {}).get("categories", []))
        if not a_cats:
            return None, None

        # Find siblings (same category, different word)
        siblings = []
        for cat in a_cats:
            for sib in self.graph.by_category.get(cat, []):
                if sib != a:
                    siblings.append(sib)

        if not siblings:
            return None, None

        b_word = random.choice(siblings)

        # Find a property of A that B might lack
        a_props = _short_concepts(self.graph.neighbors(a, "HasProperty"), max_words=2)
        b_props = set(p[0] for p in self.graph.neighbors(b_word, "HasProperty"))

        chain_edges = []
        differentiators = [p for p in a_props if p[0] not in b_props]
        if differentiators:
            p_edge = random.choice(differentiators)
            p_word = _readable(p_edge[0])
            chain_edges.append({"start": a, "relation": "HasProperty", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]})
        elif a_props:
            p_edge = random.choice(a_props)
            p_word = _readable(p_edge[0])
            chain_edges.append({"start": a, "relation": "HasProperty", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]})
        else:
            a_caps = self.graph.neighbors(a, "CapableOf")
            if a_caps:
                p_edge = random.choice(a_caps)
                p_word = _readable(p_edge[0])
                chain_edges.append({"start": a, "relation": "CapableOf", "end": p_edge[0], "weight": p_edge[1], "surface_text": p_edge[2]})
            else:
                p_word = random.choice(["ambition", "an attitude", "a plan", "patience"])

        template = self._pick_template()
        saying = template.format(A=a, B=b_word, P=p_word,
                                 A_article=_a(a), B_article=_a(b_word))

        debug = {
            "template_family": self.id,
            "template": template,
            "chain": f"{a} IsA same category as {b_word}; {a} HasProperty {p_word}",
            "chain_edges": chain_edges,
            "slots": {"A": a, "B": b_word, "P": p_word},
        }
        return saying, debug


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _readable(concept):
    """Convert ConceptNet concept to readable form: 'feed_chicken' -> 'feed chicken'."""
    return concept.replace("_", " ")


def _short_concepts(items, max_words=2):
    """Filter concept tuples to only those with short readable names.

    Items can be tuples where first element is the concept string.
    Returns items where the concept has at most max_words words.
    """
    return [item for item in items if len(item[0].split("_")) <= max_words]


def _gerund(word):
    """Rough gerund form of a verb/action word."""
    word = word.split("_")[0] if "_" in word else word  # take first word for compounds
    if word.endswith("e") and not word.endswith("ee"):
        return word[:-1] + "ing"
    if word.endswith("ing"):
        return word
    # CVC doubling: "run" -> "running", "sit" -> "sitting"
    # Treat 'y' and 'w' as vowels at word end (prey->preying, not preyying)
    if len(word) > 2 and word[-1] not in "aeiouy" and word[-2] in "aeiou" and word[-3] not in "aeiou":
        return word + word[-1] + "ing"
    return word + "ing"


def _looks_like_verb(word):
    """Heuristic: does this word look like it could be a verb?
    Rejects words with obvious noun/adjective suffixes."""
    w = word.split("_")[0].lower() if "_" in word else word.lower()
    noun_suffixes = ("tion", "sion", "ment", "ness", "ity", "ance", "ence",
                     "ture", "ism", "ist", "ery", "ory", "ling")
    return not any(w.endswith(s) for s in noun_suffixes)


def _a(word):
    """Add 'a' or 'an' article."""
    if not word:
        return "a"
    first = word.split()[0].lower() if word else ""
    # Words that start with a vowel letter but consonant sound
    consonant_sound = ("uni", "use", "used", "user", "usual", "usu", "uti", "uto",
                       "uro", "uku", "ula")
    # Words that start with a consonant letter but vowel sound
    vowel_sound = ("hour", "honest", "honor", "honour", "heir", "herb")
    if any(first.startswith(p) for p in vowel_sound):
        return f"an {word}"
    if any(first.startswith(p) for p in consonant_sound):
        return f"a {word}"
    if first[0] in "aeiou":
        return f"an {word}"
    return f"a {word}"


TEMPLATE_REGISTRY = {
    "deconstruction": Deconstruction,
    "denial_of_consequences": DenialOfConsequences,
    "ironic_deficiency": IronicDeficiency,
    "futile_preparation": FutilePreparation,
    "hypocritical_complaint": HypocriticalComplaint,
    "tautological_wisdom": TautologicalWisdom,
    "false_equivalence": FalseEquivalence,
}


# ---------------------------------------------------------------------------
# Main generation logic
# ---------------------------------------------------------------------------

def generate_one(graph, template_id=None, seed_word=None, seed_category=None,
                 debug=False, max_retries=20):
    """Generate a single folksy saying.

    When debug=True, always returns (saying, debug_dict) with chain_edges included.
    """
    for _ in range(max_retries):
        if template_id:
            tid = template_id
        else:
            tid = random.choice(list(TEMPLATE_REGISTRY.keys()))

        cls = TEMPLATE_REGISTRY.get(tid)
        if not cls:
            print(f"Unknown template: {tid}", file=sys.stderr)
            return None, None

        tmpl = cls(graph)
        saying, dbg = tmpl.generate(seed_word=seed_word, seed_category=seed_category)
        if saying:
            if debug:
                return saying, dbg
            return saying, None

    return None, None


def _get_seed_word(dbg):
    """Extract the primary seed word from debug slots for dedup tracking."""
    slots = dbg.get("slots", {})
    # Templates use different slot names for the seed
    for key in ("A", "Z", "seed", "X"):
        if key in slots:
            return slots[key]
    return None


def main():
    parser = argparse.ArgumentParser(
        description="Generate folksy fake-proverbs using ConceptNet relationships."
    )
    parser.add_argument("--template", "-t", choices=list(TEMPLATE_REGISTRY.keys()),
                        help="Specify a meta-template family")
    parser.add_argument("--seed", "-s", help="Seed with a specific word")
    parser.add_argument("--category", "-c", help="Seed with a category (e.g., animal, tool)")
    parser.add_argument("--entities", "-e", help="Path to fictional entities JSON file")
    parser.add_argument("--count", "-n", type=int, default=1, help="Number of sayings to generate")
    parser.add_argument("--output", "-o", help="Output file (default: stdout)")
    parser.add_argument("--debug", "-d", action="store_true", help="Show relationship chain debug info")
    parser.add_argument("--json", action="store_true", help="Output JSONL format with full metadata")
    parser.add_argument("--vocab", help="Path to folksy_vocab.csv")
    parser.add_argument("--relations", help="Path to folksy_relations.csv")
    parser.add_argument("--pure-conceptnet", action="store_true",
                        help="Skip loading augmented relations file")
    parser.add_argument("--llm-weight-boost", type=float, default=0.0,
                        help="Boost weight of LLM-augmented edges with weight < 1.0 (default: 0.0)")
    parser.add_argument("--list-templates", action="store_true", help="List available templates")
    parser.add_argument("--list-categories", action="store_true", help="List available categories")

    args = parser.parse_args()

    if args.list_templates:
        for tid, cls in TEMPLATE_REGISTRY.items():
            print(f"  {tid:30s} {cls.name}")
        return

    # Load graph
    graph = FolksyGraph()
    try:
        graph.load(
            vocab_path=args.vocab or (DATA_DIR / "folksy_vocab.csv"),
            relations_path=args.relations or (DATA_DIR / "folksy_relations.csv"),
        )
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        print("Run scripts/extract_from_conceptnet.py first to generate data files.", file=sys.stderr)
        sys.exit(1)

    # Load augmented relations if available
    if not args.pure_conceptnet:
        augmented_path = DATA_DIR / "folksy_relations_augmented.csv"
        if augmented_path.exists():
            boost = args.llm_weight_boost
            with open(augmented_path, newline="", encoding="utf-8") as f:
                reader = csv.DictReader(f)
                count = 0
                for row in reader:
                    sw = row["start_word"]
                    ew = row["end_word"]
                    rel = row["relation"]
                    w = float(row["weight"])
                    if w < 1.0 and boost:
                        w = min(w + boost, 1.0)
                    surf = row.get("surface_text", "")
                    graph.edges[(sw, rel)].append((ew, w, surf))
                    graph.reverse[(ew, rel)].append((sw, w, surf))
                    graph.all_edges[sw].append((ew, rel, w))
                    graph.all_edges[ew].append((sw, rel, w))
                    count += 1
            if count:
                print(f"Loaded {count} augmented edges.", file=sys.stderr)

    if args.list_categories:
        for cat in sorted(graph.by_category.keys()):
            print(f"  {cat:20s} ({len(graph.by_category[cat])} words)")
        return

    # Merge fictional entities
    if args.entities:
        graph.merge_fictional(args.entities)

    # JSON mode implies debug internally
    use_debug = args.debug or args.json

    # Generate
    out = open(args.output, "w", encoding="utf-8") if args.output else sys.stdout
    try:
        if args.count > 1:
            # Deduplication tracking for batch mode
            seen_text = set()
            seen_slots = set()
            seed_usage = defaultdict(int)
            generated = 0
            max_outer_attempts = args.count * 10  # generous outer limit
            attempts = 0

            while generated < args.count and attempts < max_outer_attempts:
                attempts += 1
                saying, dbg = generate_one(
                    graph,
                    template_id=args.template,
                    seed_word=args.seed,
                    seed_category=args.category,
                    debug=use_debug,
                )
                if not saying:
                    continue

                # Dedup checks (failures don't count against retry limit)
                if saying in seen_text:
                    continue

                if dbg:
                    slots_key = (dbg["template_family"], frozenset(dbg["slots"].items()))
                    if slots_key in seen_slots:
                        continue

                    seed_w = _get_seed_word(dbg)
                    if seed_w and seed_usage[seed_w] >= 30:
                        continue
                    if seed_w:
                        seed_usage[seed_w] += 1
                    seen_slots.add(slots_key)

                seen_text.add(saying)
                generated += 1

                if args.json and dbg:
                    record = {
                        "raw_text": saying,
                        "meta_template": dbg["template_family"],
                        "surface_template": dbg["template"],
                        "slots": dbg["slots"],
                        "chain": dbg.get("chain_edges", []),
                    }
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                else:
                    out.write(saying + "\n")
                    if args.debug and dbg:
                        out.write(f"  [DEBUG] family={dbg['template_family']}\n")
                        out.write(f"  [DEBUG] chain: {dbg['chain']}\n")
                        out.write(f"  [DEBUG] slots: {dbg['slots']}\n")
                        out.write("\n")
        else:
            # Single generation (no dedup needed)
            saying, dbg = generate_one(
                graph,
                template_id=args.template,
                seed_word=args.seed,
                seed_category=args.category,
                debug=use_debug,
            )
            if saying:
                if args.json and dbg:
                    record = {
                        "raw_text": saying,
                        "meta_template": dbg["template_family"],
                        "surface_template": dbg["template"],
                        "slots": dbg["slots"],
                        "chain": dbg.get("chain_edges", []),
                    }
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                else:
                    out.write(saying + "\n")
                    if args.debug and dbg:
                        out.write(f"  [DEBUG] family={dbg['template_family']}\n")
                        out.write(f"  [DEBUG] chain: {dbg['chain']}\n")
                        out.write(f"  [DEBUG] slots: {dbg['slots']}\n")
                        out.write("\n")
            else:
                out.write("(failed to generate saying after retries)\n")
    finally:
        if args.output:
            out.close()


if __name__ == "__main__":
    main()