folksy_idioms/scripts/expand_vocab.py

#!/usr/bin/env python3
"""Expand folksy vocabulary with high-quality candidates from LLM suggestions.

Reads candidate_additions.csv (words suggested by the LLM during phase 1 that
weren't in the vocab), filters for quality, uses the LLM to assign categories,
and appends the survivors to folksy_vocab.csv.

After running this, re-run `enhance_graph.py --phase 1` to generate edges
for the new words (the checkpoint will skip already-processed words).

Usage:
  python scripts/expand_vocab.py                  # Full run
  python scripts/expand_vocab.py --dry-run         # Show what would be added
  python scripts/expand_vocab.py --min-citations 8 # Stricter threshold
"""

import argparse
import csv
import json
import re
import shutil
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

# Filesystem layout, resolved relative to this file: the script's own
# directory, its parent (the project root), and the project's data directory.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"

# URL and model name used by llm_chat_completion() for every request.
# NOTE(review): the "/v1d/" path segment looks unusual -- confirm it matches
# the route actually exposed by the LLM server.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

# VOCAB_CSV is read, backed up and appended to by main(); CANDIDATE_CSV is
# read-only input with one (word, suggested_by) pair per row.
VOCAB_CSV = DATA_DIR / "folksy_vocab.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"

# Valid categories from the existing vocabulary
# parse_categories() silently drops any category returned by the LLM that is
# not in this set. The same names are spelled out in CATEGORIZE_SYSTEM, so the
# two lists have to be edited together.
VALID_CATEGORIES = {
    "animal", "beverage", "bird", "building", "clothing", "container", "crop",
    "fabric", "fish", "flower", "food", "fruit", "furniture", "grain", "herb",
    "insect", "instrument", "landscape", "material", "metal", "mineral",
    "organism", "plant", "rock", "seed", "shelter", "spice", "stone",
    "structure", "tool", "tree", "vegetable", "vehicle", "water", "weapon", "wood",
}

# ---------------------------------------------------------------------------
# Exclusion lists
# ---------------------------------------------------------------------------

# Abstract concepts, emotions, processes — not concrete enough for folksy vocab
# main() rejects a candidate found here and tallies it as "abstract".
# Each word is listed exactly once; a repeated entry in a set literal has no
# effect and only hides the real size of the list.
EXCLUDE_ABSTRACT = {
    "ecosystem", "satisfaction", "fullness", "warmth", "fear", "relaxation",
    "growth", "interest", "nature", "protection", "digestion", "injury",
    "decoration", "construction", "landscape", "noise", "sound", "energy",
    "nourishment", "nutrition", "pollination", "sustainability", "tradition",
    "biodiversity", "symbolism", "elegance", "resilience", "patience",
    "beauty", "abundance", "fertility", "creativity", "harmony", "comfort",
    "curiosity", "companionship", "loyalty", "aggression", "alertness",
    "camouflage", "predation", "migration", "hibernation", "decomposition",
    "erosion", "combustion", "fermentation", "oxidation", "corrosion",
    "photosynthesis", "respiration", "evaporation", "precipitation",
    "transpiration", "germination", "excitement", "enjoyment", "satiety",
    "stability", "organization", "fragrance", "moisture", "wildlife",
    "preservation", "conversation", "inspiration", "storage", "observation",
    "hydration", "destruction", "entertainment", "education", "knowledge",
    "safety", "practice", "research", "skill", "space", "license",
    "collection", "habitat", "pollution", "health", "vibration", "wonder",
    "awe", "refreshment", "irritation", "happiness", "joy", "damage",
    "death", "pain", "thirst", "alarm", "contents", "ingredients",
    "electricity", "oxygen", "navigation", "recreation", "meditation",
    "celebration", "communication", "imagination", "devotion",
    "ambition", "endurance", "independence", "discipline", "cooperation",
    "sweetness", "aroma", "flavor", "texture",
    "smell", "color", "surface", "bottom", "edge",
    "nutrients", "study", "outfit", "upholstery",
}

# Scientific/technical — not folksy enough for folk wisdom
# main() rejects a candidate found here and tallies it as "technical".
# "organism" and "material" are also category names in VALID_CATEGORIES; being
# listed here only stops them from being added as vocabulary *words*.
# Each word is listed exactly once (a repeated set-literal entry is a no-op).
EXCLUDE_TECHNICAL = {
    "cellulose", "exoskeleton", "protein", "tissue", "cells", "alloy",
    "enzyme", "chlorophyll", "genome", "photon",
    "organism", "molecule", "compound", "polymer", "isotope",
    "ecosystem", "metabolism", "catalyst", "membrane", "chromosome",
    "cell", "nutrient", "ingredient", "material", "content",
}

# Collective/institutional nouns — not concrete individual things
# main() rejects a candidate found here and tallies it as "institutional".
# Besides collectives and institutions, the list also holds meal, dish,
# appliance and place words (e.g. "breakfast", "sandwich", "refrigerator",
# "garage").
EXCLUDE_INSTITUTIONAL = {
    "orchestra", "fleet", "arsenal", "toolkit", "collection",
    "restaurant", "museum", "university", "corporation", "organization",
    "musician", "breakfast", "dinner", "meal", "dish", "sandwich",
    "seafood", "refrigerator", "garage", "basement", "park",
}

# Adjectives and properties — useful as HasProperty targets but not as vocab words
# main() rejects a candidate found here and tallies it as "adjective".
# Both spellings "grey" and "gray" are listed on purpose-looking grounds:
# matching is exact string membership, so each variant needs its own entry.
EXCLUDE_ADJECTIVES = {
    "small", "large", "heavy", "colorful", "green", "brown", "hard",
    "white", "round", "sharp", "sturdy", "long", "soft", "flat",
    "sweet", "bitter", "smooth", "rough", "bright", "dark", "dry",
    "wet", "thick", "thin", "warm", "cold", "hot", "tall", "short",
    "red", "blue", "yellow", "black", "grey", "gray", "pink",
    "fragrant", "loud", "spicy", "sour", "tough", "delicate", "strong",
    "weak", "light", "dense", "portable", "lightweight", "transparent",
    "opaque", "flexible", "rigid", "brittle", "elastic", "porous",
    "compact", "edible", "toxic", "aromatic", "nocturnal", "aquatic",
    "durable", "cylindrical", "wooden", "shiny", "solid", "narrow",
    "metallic", "pungent", "juicy", "fast", "powerful", "woody",
    "fibrous", "savory", "liquid", "enclosed", "rectangular", "wild",
    "feathered", "leafy", "crunchy", "dangerous", "fuzzy", "slimy",
    "natural", "waterproof", "electronic",
}

# Words that are clearly verbs or gerunds
# main() rejects a candidate found here and tallies it as "verb_gerund".
# This check runs before main()'s generic "-ing" catch-all, so the "-ing"
# entries below are counted here rather than under "gerund_pattern"; the bare
# verb forms ("swim", "run", ...) are only caught by this list.
EXCLUDE_VERBS = {
    "eating", "cooking", "growing", "fishing", "hunting", "flying",
    "mining", "flavoring", "singing", "blooming", "holding", "baking",
    "ripening", "opening", "cutting", "protecting", "seasoning",
    "storing", "building", "swimming", "brewing", "weaving", "carving",
    "climbing", "digging", "plowing", "sewing", "spinning", "tanning",
    "swim", "run", "grow", "eat", "hunt", "peck", "bite", "dive",
    "crawl", "cut", "shine", "sparkle",
}


def singularize(word):
    """Best-effort singularization. Returns (singular, was_plural).

    This is a suffix heuristic, not a dictionary lookup, so unusual words can
    still come out wrong. The rules are applied in this order:

    1. Explicit irregular plurals (teeth -> tooth, hooves -> hoof, ...).
    2. Nouns ending in "s" that are already singular or have no singular
       (lens, canvas, scissors, ...) are returned unchanged.
    3. "-ies" -> "-y" for words longer than four letters (berries -> berry;
       "pies" and "ties" fall through to rule 6).
    4. "-sses", "-xes", "-zzes", "-tzes", "-ches", "-shes" -> drop "es".
    5. consonant + "-uses" / "-ises" -> drop "es" (buses -> bus,
       irises -> iris); a vowel before the suffix means the singular ends in
       "e" (houses, tortoises) and the word falls through to rule 6.
    6. Any other trailing "s", except "-ss", "-us", "-is" -> drop "s". This
       covers "-ves", "-ses" and "-zes" words whose singular ends in "e"
       (gloves -> glove, horses -> horse, mazes -> maze).

    Args:
        word: A lowercase word.

    Returns:
        Tuple ``(singular, was_plural)``; ``was_plural`` is False when the
        word is returned unchanged.
    """
    # Irregular plurals. Every "-ves" plural whose singular ends in "-f"/"-fe"
    # must be listed here: there is no generic "-ves -> -f" rule, because far
    # more "-ves" nouns are simply "-ve" + "s" (gloves, olives, stoves, hives).
    irregulars = {
        "teeth": "tooth", "feet": "foot", "geese": "goose", "mice": "mouse",
        "lice": "louse", "dice": "die", "oxen": "ox", "children": "child",
        "leaves": "leaf", "loaves": "loaf", "halves": "half", "knives": "knife",
        "lives": "life", "wives": "wife", "wolves": "wolf", "shelves": "shelf",
        "calves": "calf", "hooves": "hoof", "scarves": "scarf",
        "thieves": "thief", "elves": "elf", "selves": "self",
        "sheaves": "sheaf", "wharves": "wharf", "dwarves": "dwarf",
        # "-s" stems that the suffix rules below would turn into "-se".
        "gases": "gas", "lenses": "lens", "canvases": "canvas",
    }
    if word in irregulars:
        return irregulars[word], True

    # Singular nouns that happen to end in "s", and plural-only nouns; stripping
    # the "s" would produce a non-word ("len", "canva", "scissor").
    invariant = {
        "lens", "canvas", "gas", "molasses", "species", "series",
        "scissors", "pliers", "tongs", "shears", "trousers", "pants",
    }
    if word in invariant:
        return word, False

    # -ies -> -y
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y", True

    # Sibilant stems that take "-es": glasses, boxes, buzzes, waltzes,
    # peaches, bushes.
    if word.endswith(("sses", "xes", "zzes", "tzes", "ches", "shes")):
        return word[:-2], True

    # consonant + -uses / -ises: the stem itself ends in "us" / "is".
    if (word.endswith(("uses", "ises")) and len(word) > 4
            and word[-5] not in "aeiou"):
        return word[:-2], True

    # -s (but not -ss, -us, -is)
    if word.endswith("s") and not word.endswith(("ss", "us", "is")):
        return word[:-1], True

    return word, False


def is_plural_of_existing(word, existing_vocab):
    """Tell whether *word* looks like a plural of something already in the vocab.

    Each plural suffix the word carries yields one or two candidate singular
    spellings; the word counts as a plural as soon as any of them is a member
    of *existing_vocab*.

    Args:
        word: Candidate word to test.
        existing_vocab: Container of known vocabulary words (membership-tested).

    Returns:
        True if a derived singular form is in *existing_vocab*, else False.
    """
    singular_guesses = []
    if word.endswith("s"):
        # cats -> cat
        singular_guesses.append(word[:-1])
    if word.endswith("es"):
        # boxes -> box
        singular_guesses.append(word[:-2])
    if word.endswith("ies"):
        # berries -> berry
        singular_guesses.append(word[:-3] + "y")
    if word.endswith("ves"):
        # wolves -> wolf, knives -> knife
        stem = word[:-3]
        singular_guesses.extend((stem + "f", stem + "fe"))
    return any(guess in existing_vocab for guess in singular_guesses)


def is_plural_of_candidate(word, accepted_words):
    """Check if word is a plural of another candidate, or vice versa.

    Both directions apply the same four suffix patterns (+s, +es, y <-> ies,
    f/fe <-> ves), so the answer does not depend on which of the two forms
    was accepted first.

    Args:
        word: Candidate word to test.
        accepted_words: Container of already-accepted candidate words
            (membership-tested).

    Returns:
        True if a singular form of *word* or a plural form of *word* is in
        *accepted_words*, else False.
    """
    # Is this word a plural of something accepted?
    related_forms = []
    if word.endswith("s"):
        related_forms.append(word[:-1])
    if word.endswith("es"):
        related_forms.append(word[:-2])
    if word.endswith("ies"):
        related_forms.append(word[:-3] + "y")
    if word.endswith("ves"):
        # wolves -> wolf, knives -> knife
        related_forms.append(word[:-3] + "f")
        related_forms.append(word[:-3] + "fe")

    # Is something accepted a plural of this word?
    related_forms.append(word + "s")
    related_forms.append(word + "es")
    if word.endswith("y"):
        # berry -> berries
        related_forms.append(word[:-1] + "ies")
    if word.endswith("f"):
        related_forms.append(word[:-1] + "ves")
    if word.endswith("fe"):
        related_forms.append(word[:-2] + "ves")

    return any(form in accepted_words for form in related_forms)


# ---------------------------------------------------------------------------
# LLM categorization
# ---------------------------------------------------------------------------

# System prompt sent with every categorization batch. The category names in it
# repeat VALID_CATEGORIES; parse_categories() validates against the set, so a
# name present here but missing there would be discarded.
# The requested reply shape ("word: category1, category2", one per line, or
# SKIP) is exactly what parse_categories() parses.
CATEGORIZE_SYSTEM = """You are a vocabulary categorizer. Given a list of concrete nouns, assign each one to one or more categories from this fixed list:

animal, beverage, bird, building, clothing, container, crop, fabric, fish, flower, food, fruit, furniture, grain, herb, insect, instrument, landscape, material, metal, mineral, organism, plant, rock, seed, shelter, spice, stone, structure, tool, tree, vegetable, vehicle, water, weapon, wood

Rules:
- Use ONLY categories from the list above
- A word can have multiple categories (e.g., "brick" -> material, stone)
- If a word fits none of the categories well, output SKIP
- Output format: word: category1, category2
- One word per line"""

# User-message template; categorize_words() fills {word_list} with one
# "- word" bullet per line.
CATEGORIZE_USER = """Categorize these words:
{word_list}"""


def llm_chat_completion(messages, max_retries=3):
    """POST a chat-completion request, retrying with exponential backoff.

    Args:
        messages: List of chat message dicts sent as the request's
            "messages" field.
        max_retries: Maximum number of attempts.

    Returns:
        The content string of the first choice in the response, or None when
        every attempt failed (or when ``max_retries`` allows no attempt).
        Failures are reported on stderr; no exception propagates to the caller.
    """
    # Imported here rather than at module level, so code paths that never call
    # this function do not need the package installed.
    import requests

    request_body = {"model": LLM_MODEL, "messages": messages}

    for attempt_no in range(1, max_retries + 1):
        try:
            reply = requests.post(LLM_ENDPOINT, json=request_body, timeout=120)
            reply.raise_for_status()
            return reply.json()["choices"][0]["message"]["content"]
        except Exception as exc:
            # Covers transport errors, HTTP error statuses and malformed
            # response bodies alike; all of them are retried.
            backoff = 2 ** (attempt_no - 1)  # 1s, 2s, 4s, ...
            print(f"  LLM call failed (attempt {attempt_no}/{max_retries}): {exc}",
                  file=sys.stderr)
            if attempt_no == max_retries:
                print(f"  Giving up on this batch.", file=sys.stderr)
                return None
            print(f"  Retrying in {backoff}s...", file=sys.stderr)
            time.sleep(backoff)

    return None


def parse_categories(response_text, valid_words):
    """Turn the LLM's categorization reply into a ``{word: [categories]}`` dict.

    Each non-empty line is expected to look like ``word: cat1, cat2``,
    optionally preceded by a list marker ("1.", "2)", "-", "*", "•"). A line
    is ignored when it does not match that shape, when its right-hand side
    contains "SKIP" (case-insensitive), when none of its categories is in
    VALID_CATEGORIES, or when its word is not in *valid_words*.

    Args:
        response_text: Raw reply text; None or "" yields an empty dict.
        valid_words: Container of words that were actually asked about.

    Returns:
        Dict mapping each recognised lowercase word to its list of valid
        categories, in the order the reply gave them.
    """
    if not response_text:
        return {}

    parsed = {}
    for raw_line in response_text.strip().split("\n"):
        entry = raw_line.strip()
        if not entry:
            continue

        # Remove a leading list marker: numbering first, then a bullet.
        entry = re.sub(r"^[\d]+[.)]\s*", "", entry)
        entry = re.sub(r"^[-*•]\s*", "", entry).strip()

        found = re.match(r"^(\w+)\s*:\s*(.+)$", entry)
        if found is None:
            continue

        key = found.group(1).strip().lower()
        raw_cats = found.group(2).strip()
        if "SKIP" in raw_cats.upper():
            continue

        # Keep only category names the vocabulary knows about.
        normalized = (piece.strip().lower() for piece in raw_cats.split(","))
        kept = [name for name in normalized if name in VALID_CATEGORIES]

        if key in valid_words and kept:
            parsed[key] = kept

    return parsed


def categorize_words(words, batch_size=25):
    """Ask the LLM for categories, *batch_size* words per request.

    Args:
        words: List of words to categorize.
        batch_size: Number of words sent in each request.

    Returns:
        Dict mapping word -> list of categories, merged over all batches.
        Words from a failed batch, and words the reply skipped or garbled,
        are simply absent.
    """
    known_words = set(words)
    merged = {}

    batch_starts = range(0, len(words), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        chunk = words[start:start + batch_size]
        bullet_list = "\n".join(f"- {w}" for w in chunk)

        reply = llm_chat_completion([
            {"role": "system", "content": CATEGORIZE_SYSTEM},
            {"role": "user", "content": CATEGORIZE_USER.format(word_list=bullet_list)},
        ])
        # A failed call returns None, which parses to an empty dict.
        batch_result = parse_categories(reply, known_words)
        merged.update(batch_result)

        print(f"  Batch {batch_no}: {len(batch_result)}/{len(chunk)} categorized")
        # Short pause between consecutive requests.
        time.sleep(0.1)

    return merged


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

# Column order used only when the vocab CSV has no header row to copy from.
_DEFAULT_VOCAB_FIELDS = (
    "word", "categories", "tangibility_score",
    "conceptnet_edge_count", "frequency_rank",
)

# Nouns that merely look like gerunds and must survive the "-ing" catch-all.
_GERUND_EXCEPTIONS = frozenset({"ring", "spring", "string", "wing", "ceiling"})


def _rejection_reason(word, citation_count, min_citations, existing_words,
                      accepted_set):
    """Return the reject-reason key for a candidate, or None if it passes.

    The checks run in a fixed order and the first one that fails decides the
    reason, so a word matching several filters is counted exactly once.
    """
    # Minimum citation threshold
    if citation_count < min_citations:
        return "below_threshold"

    # No multi-word (underscore) candidates
    if "_" in word:
        return "multi_word"

    # Already in vocab
    if word in existing_words:
        return "already_in_vocab"

    # Hand-maintained exclusion lists, in tally-priority order.
    exclusion_lists = (
        ("abstract", EXCLUDE_ABSTRACT),
        ("adjective", EXCLUDE_ADJECTIVES),
        ("verb_gerund", EXCLUDE_VERBS),
        ("technical", EXCLUDE_TECHNICAL),
        ("institutional", EXCLUDE_INSTITUTIONAL),
    )
    for reason, excluded in exclusion_lists:
        if word in excluded:
            return reason

    # Gerund pattern catch-all (but allow exceptions)
    if word.endswith("ing") and word not in _GERUND_EXCEPTIONS:
        return "gerund_pattern"

    # Plurals of existing vocab, then plurals of already-accepted candidates
    if is_plural_of_existing(word, existing_words):
        return "plural_of_existing"
    if is_plural_of_candidate(word, accepted_set):
        return "plural_of_candidate"

    # Single character
    if len(word) < 2:
        return "too_short"

    return None


def _append_vocab_rows(path, fieldnames, entries):
    """Append *entries* to the CSV at *path*, one row each, in *fieldnames* order.

    If the file's last byte is not a line terminator, a newline is written
    first; otherwise the first new row would be glued onto the existing last
    row and corrupt both.
    """
    # Reading the whole file is fine here: the caller has just parsed it in
    # full, so it is known to fit in memory.
    last_byte = Path(path).read_bytes()[-1:]
    needs_newline = bool(last_byte) and last_byte not in (b"\n", b"\r")

    with open(path, "a", newline="", encoding="utf-8") as f:
        if needs_newline:
            f.write("\n")
        # restval/extrasaction keep rows aligned with the file's own header
        # even if it has extra columns or lacks one of ours.
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval="",
                                extrasaction="ignore")
        writer.writerows(entries)


def main():
    """Command-line entry point: filter candidates, categorize, extend the vocab.

    Steps:
      1. Load the existing vocabulary and the candidate rows.
      2. Merge plural candidates into their singular form, pooling the set of
         vocab words that suggested them.
      3. Reject candidates that fail any filter (see ``_rejection_reason``) and
         print a tally per reason.
      4. Stop here on ``--dry-run``.
      5. Categorize the survivors via the LLM (or use the "material"
         placeholder with ``--no-llm``); words without categories are dropped.
      6. Back up the vocab CSV and append the new rows to it.
    """
    parser = argparse.ArgumentParser(
        description="Expand folksy vocabulary with LLM-suggested candidates."
    )
    parser.add_argument("--min-citations", type=int, default=5,
                        help="Minimum number of vocab words that suggested this candidate (default: 5)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be added without modifying files")
    parser.add_argument("--no-llm", action="store_true",
                        help="Skip LLM categorization (use placeholder categories)")

    args = parser.parse_args()

    # Load existing vocab; remember its header so appended rows match it.
    with open(VOCAB_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        existing_words = {row["word"] for row in reader}
        vocab_fields = reader.fieldnames or list(_DEFAULT_VOCAB_FIELDS)
    print(f"Existing vocabulary: {len(existing_words)} words")

    # Load candidates and aggregate: unique sources per candidate word
    word_sources = defaultdict(set)
    candidate_rows = 0
    with open(CANDIDATE_CSV, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            word_sources[row["word"]].add(row["suggested_by"])
            candidate_rows += 1

    print(f"Total candidate rows: {candidate_rows}")
    print(f"Unique candidate words: {len(word_sources)}")

    # Normalize plurals: merge citation counts into singular forms
    normalized_sources = defaultdict(set)
    for word, sources in word_sources.items():
        singular, _ = singularize(word)
        normalized_sources[singular].update(sources)
    word_sources = dict(normalized_sources)
    print(f"After singularization: {len(word_sources)} unique candidates")

    # Filter
    accepted = []
    accepted_set = set()
    reject_reasons = Counter()

    # Sort by citation count descending for consistent ordering. The order
    # matters: of two related forms, the one seen first is accepted and the
    # later one is rejected as "plural_of_candidate".
    sorted_candidates = sorted(word_sources.items(), key=lambda x: len(x[1]), reverse=True)

    for word, sources in sorted_candidates:
        citation_count = len(sources)
        reason = _rejection_reason(word, citation_count, args.min_citations,
                                   existing_words, accepted_set)
        if reason is not None:
            reject_reasons[reason] += 1
            continue

        accepted.append((word, citation_count))
        accepted_set.add(word)

    print(f"\nFiltering results:")
    print(f"  Accepted: {len(accepted)}")
    for reason, count in reject_reasons.most_common():
        print(f"  Rejected ({reason}): {count}")

    if not accepted:
        print("\nNo candidates passed filtering.")
        return

    # Show accepted words
    print(f"\nAccepted candidates ({len(accepted)}):")
    for word, count in accepted:
        print(f"  {word:25s} cited by {count:3d} vocab words")

    if args.dry_run:
        print(f"\nDry run complete. Would add {len(accepted)} words to vocabulary.")
        return

    # Categorize with LLM
    words_to_categorize = [w for w, _ in accepted]

    if args.no_llm:
        print("\nSkipping LLM categorization (--no-llm). Using 'material' as placeholder.")
        categories = {w: ["material"] for w in words_to_categorize}
    else:
        print(f"\nCategorizing {len(words_to_categorize)} words with LLM...")
        categories = categorize_words(words_to_categorize)

    # Words the LLM couldn't categorize get skipped
    uncategorized = [w for w in words_to_categorize if w not in categories]
    if uncategorized:
        print(f"\n  {len(uncategorized)} words could not be categorized (skipped):")
        for w in uncategorized:
            print(f"    {w}")

    # Build new vocab entries. Score, edge count and rank are fixed
    # placeholder values for every newly added word.
    new_entries = [
        {
            "word": word,
            "categories": ",".join(categories[word]),
            "tangibility_score": "0.80",
            "conceptnet_edge_count": "0",
            "frequency_rank": "0",
        }
        for word, _ in accepted
        if word in categories
    ]

    if not new_entries:
        print("\nNo entries to add after categorization.")
        return

    # Backup existing vocab
    backup_path = VOCAB_CSV.with_suffix(f".csv.bak.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    shutil.copy2(VOCAB_CSV, backup_path)
    print(f"\nBacked up vocabulary to {backup_path.name}")

    # Append to vocab CSV
    _append_vocab_rows(VOCAB_CSV, vocab_fields, new_entries)

    print(f"\nAdded {len(new_entries)} words to {VOCAB_CSV.name}")
    print(f"New vocabulary size: {len(existing_words) + len(new_entries)}")

    # Summary by category
    cat_counts = Counter()
    for entry in new_entries:
        for c in entry["categories"].split(","):
            cat_counts[c.strip()] += 1
    print(f"\nNew words by category:")
    for cat, count in cat_counts.most_common():
        print(f"  {cat:20s} {count:3d}")

    print(f"\nNext step: run 'python scripts/enhance_graph.py --phase 1' to generate edges for new words.")


# Run the pipeline only when this file is executed as a script, so importing
# the module (e.g. to reuse singularize) has no side effects.
if __name__ == "__main__":
    main()