#!/usr/bin/env python3
"""Expand folksy vocabulary with high-quality candidates from LLM suggestions.

Reads candidate_additions.csv (words suggested by the LLM during phase 1 that
weren't in the vocab), filters for quality, uses the LLM to assign categories,
and appends the survivors to folksy_vocab.csv.

After running this, re-run `enhance_graph.py --phase 1` to generate edges for
the new words (the checkpoint will skip already-processed words).

Usage:
    python scripts/expand_vocab.py                      # Full run
    python scripts/expand_vocab.py --dry-run            # Show what would be added
    python scripts/expand_vocab.py --min-citations 8    # Stricter threshold
"""

import argparse
import csv
import json
import re
import shutil
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"

LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

VOCAB_CSV = DATA_DIR / "folksy_vocab.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"

# Column order used for appended rows when the vocab file has no header of
# its own to follow.
VOCAB_FIELDNAMES = [
    "word",
    "categories",
    "tangibility_score",
    "conceptnet_edge_count",
    "frequency_rank",
]

# Valid categories from the existing vocabulary
VALID_CATEGORIES = {
    "animal", "beverage", "bird", "building", "clothing", "container",
    "crop", "fabric", "fish", "flower", "food", "fruit", "furniture",
    "grain", "herb", "insect", "instrument", "landscape", "material",
    "metal", "mineral", "organism", "plant", "rock", "seed", "shelter",
    "spice", "stone", "structure", "tool", "tree", "vegetable", "vehicle",
    "water", "weapon", "wood",
}

# ---------------------------------------------------------------------------
# Exclusion lists
# ---------------------------------------------------------------------------

# Abstract concepts, emotions, processes — not concrete enough for folksy vocab
EXCLUDE_ABSTRACT = {
    "ecosystem", "satisfaction", "fullness", "warmth", "fear", "relaxation",
    "growth", "interest", "nature", "protection", "digestion", "injury",
    "decoration", "construction", "landscape", "noise", "sound", "energy",
    "nourishment", "nutrition", "pollination", "sustainability", "tradition",
    "biodiversity", "symbolism", "elegance", "resilience", "patience",
    "beauty", "abundance", "fertility", "creativity", "harmony", "comfort",
    "curiosity", "companionship", "loyalty", "aggression", "alertness",
    "camouflage", "predation", "migration", "hibernation", "decomposition",
    "erosion", "combustion", "fermentation", "oxidation", "corrosion",
    "photosynthesis", "respiration", "evaporation", "precipitation",
    "transpiration", "germination", "excitement", "enjoyment", "satiety",
    "stability", "organization", "fragrance", "moisture", "wildlife",
    "preservation", "conversation", "inspiration", "storage", "observation",
    "hydration", "destruction", "entertainment", "education", "knowledge",
    "safety", "practice", "research", "skill", "space", "license",
    "collection", "habitat", "pollution", "health", "vibration", "wonder",
    "awe", "refreshment", "irritation", "happiness", "joy", "damage",
    "death", "pain", "thirst", "alarm", "contents", "ingredients",
    "electricity", "oxygen", "navigation", "recreation", "meditation",
    "celebration", "communication", "imagination", "devotion", "ambition",
    "endurance", "independence", "discipline", "cooperation", "sweetness",
    "aroma", "flavor", "texture", "smell", "color", "surface", "bottom",
    "edge", "nutrients", "study", "outfit", "upholstery",
}

# Scientific/technical — not folksy enough for folk wisdom
EXCLUDE_TECHNICAL = {
    "cellulose", "exoskeleton", "protein", "tissue", "cells", "alloy",
    "enzyme", "chlorophyll", "genome", "photon", "organism", "molecule",
    "compound", "polymer", "isotope", "ecosystem", "metabolism", "catalyst",
    "membrane", "chromosome", "cell", "nutrient", "ingredient", "material",
    "content",
}

# Collective/institutional nouns — not concrete individual things
EXCLUDE_INSTITUTIONAL = {
    "orchestra", "fleet", "arsenal", "toolkit", "collection", "restaurant",
    "museum", "university", "corporation", "organization", "musician",
    "breakfast", "dinner", "meal", "dish", "sandwich", "seafood",
    "refrigerator", "garage", "basement", "park",
}

# Adjectives and properties — useful as HasProperty targets but not as vocab words
EXCLUDE_ADJECTIVES = {
    "small", "large", "heavy", "colorful", "green", "brown", "hard", "white",
    "round", "sharp", "sturdy", "long", "soft", "flat", "sweet", "bitter",
    "smooth", "rough", "bright", "dark", "dry", "wet", "thick", "thin",
    "warm", "cold", "hot", "tall", "short", "red", "blue", "yellow", "black",
    "grey", "gray", "pink", "fragrant", "loud", "spicy", "sour", "tough",
    "delicate", "strong", "weak", "light", "dense", "portable",
    "lightweight", "transparent", "opaque", "flexible", "rigid", "brittle",
    "elastic", "porous", "compact", "edible", "toxic", "aromatic",
    "nocturnal", "aquatic", "durable", "cylindrical", "wooden", "shiny",
    "solid", "narrow", "metallic", "pungent", "juicy", "fast", "powerful",
    "woody", "fibrous", "savory", "liquid", "enclosed", "rectangular",
    "wild", "feathered", "leafy", "crunchy", "dangerous", "fuzzy", "slimy",
    "natural", "waterproof", "electronic",
}

# Words that are clearly verbs or gerunds
EXCLUDE_VERBS = {
    "eating", "cooking", "growing", "fishing", "hunting", "flying", "mining",
    "flavoring", "singing", "blooming", "holding", "baking", "ripening",
    "opening", "cutting", "protecting", "seasoning", "storing", "building",
    "swimming", "brewing", "weaving", "carving", "climbing", "digging",
    "plowing", "sewing", "spinning", "tanning", "swim", "run", "grow", "eat",
    "hunt", "peck", "bite", "dive", "crawl", "cut", "shine", "sparkle",
}

# Concrete nouns that merely happen to end in "-ing"; they are exempt from the
# gerund catch-all filter.
GERUND_EXCEPTIONS = {
    "ring", "spring", "string", "wing", "ceiling",
    "king", "sling", "herring", "pudding", "dumpling", "sapling", "seedling",
    "duckling", "gosling", "stocking", "earring", "awning",
}

# Exclusion sets in the order they are consulted. The order matters for words
# that sit in more than one set: the first match decides the reported reason.
_EXCLUSION_SETS = (
    ("abstract", EXCLUDE_ABSTRACT),
    ("adjective", EXCLUDE_ADJECTIVES),
    ("verb_gerund", EXCLUDE_VERBS),
    ("technical", EXCLUDE_TECHNICAL),
    ("institutional", EXCLUDE_INSTITUTIONAL),
)

# ---------------------------------------------------------------------------
# Plural handling
# ---------------------------------------------------------------------------

# Plurals the suffix rules below would get wrong.
_IRREGULAR_PLURALS = {
    # Vowel-change / old plurals
    "teeth": "tooth", "feet": "foot", "geese": "goose", "mice": "mouse",
    "lice": "louse", "dice": "die", "oxen": "ox", "children": "child",
    # -f / -fe nouns (every other "-ves" word is treated as a "-ve" noun)
    "leaves": "leaf", "loaves": "loaf", "halves": "half", "knives": "knife",
    "lives": "life", "wives": "wife", "wolves": "wolf", "shelves": "shelf",
    "calves": "calf", "hooves": "hoof", "scarves": "scarf",
    "thieves": "thief", "sheaves": "sheaf", "elves": "elf",
    "dwarves": "dwarf", "wharves": "wharf",
    # Singular ends in -s, but the generic "-ses" rule would guess "-se"
    "gases": "gas", "lenses": "lens", "irises": "iris",
    "canvases": "canvas", "atlases": "atlas",
    # "-uses" word whose singular really does end in "-se"
    "fuses": "fuse",
    # -ie nouns that the "-ies -> -y" rule would mangle
    "cookies": "cookie", "magpies": "magpie", "prairies": "prairie",
    "collies": "collie",
}

# Nouns that end in "s" but are already singular (or have no distinct plural).
_SINGULAR_S_NOUNS = {
    "atlas", "canvas", "gas", "lens", "molasses", "series", "species",
}


def singularize(word):
    """Best-effort singularization of an English noun.

    Rules are tried in order:

    1. A table of irregular plurals.
    2. A short list of nouns that end in "s" but are already singular.
    3. "-ies" -> "-y" (words longer than four letters only, so "pies" and
       "ties" fall through to the plain "-s" rule).
    4. "-sses", "-xes", "-ches", "-shes", "-zzes", "-tzes" -> drop "es".
    5. "-uses" (but not "-ouses") -> drop "es", so "walruses" -> "walrus"
       while "houses" -> "house".
    6. Any other trailing "s" is dropped, except after "ss", "us" or "is".
       This also covers "-ves" words not in the irregular table
       ("gloves" -> "glove") and "-ses" / "-zes" words whose singular ends
       in "e" ("horses" -> "horse", "mazes" -> "maze").

    Args:
        word: Candidate word; expected to be lowercase.

    Returns:
        A ``(singular, was_plural)`` tuple. ``was_plural`` is False when the
        word is returned unchanged.
    """
    if word in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[word], True

    if word in _SINGULAR_S_NOUNS:
        return word, False

    # -ies -> -y
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y", True

    # Sibilant endings that take "-es" in the plural: drop the whole "es".
    if word.endswith(("sses", "xes", "ches", "shes", "zzes", "tzes")):
        return word[:-2], True

    # "-uses" is mostly "-us" + "es" (cactus, walrus, bus); "-ouses" is
    # "-ouse" + "s" (house, blouse) and is left to the plain -s rule.
    if word.endswith("uses") and not word.endswith("ouses"):
        return word[:-2], True

    # Plain -s (but not -ss, -us, -is, which are usually singular endings)
    if word.endswith("s") and not word.endswith(("ss", "us", "is")):
        return word[:-1], True

    return word, False


def _singular_guesses(word):
    """Return the singular forms `word` could be a regular plural of."""
    guesses = []
    if word.endswith("s"):
        guesses.append(word[:-1])
    if word.endswith("es"):
        guesses.append(word[:-2])
    if word.endswith("ies"):
        guesses.append(word[:-3] + "y")
    if word.endswith("ves"):
        stem = word[:-3]
        guesses.extend((stem + "f", stem + "fe"))
    return guesses


def _plural_guesses(word):
    """Return the regular plural spellings `word` could take."""
    guesses = [word + "s", word + "es"]
    if word.endswith("y"):
        guesses.append(word[:-1] + "ies")
    if word.endswith("f"):
        guesses.append(word[:-1] + "ves")
    if word.endswith("fe"):
        guesses.append(word[:-2] + "ves")
    return guesses


def is_plural_of_existing(word, existing_vocab):
    """Check if word is likely a plural form of an existing vocab word.

    Args:
        word: Candidate word.
        existing_vocab: Container of known words supporting ``in``.

    Returns:
        True if stripping "-s" / "-es", or mapping "-ies" -> "-y" or
        "-ves" -> "-f" / "-fe", yields a word in ``existing_vocab``.
    """
    return any(guess in existing_vocab for guess in _singular_guesses(word))


def is_plural_of_candidate(word, accepted_words):
    """Check if word is a plural of another candidate, or vice versa.

    Args:
        word: Candidate word.
        accepted_words: Container of already-accepted candidates.

    Returns:
        True if ``word`` looks like a plural of an accepted word, or an
        accepted word looks like a plural of ``word``.
    """
    # Is this word a plural of something accepted? Uses the same rules as
    # is_plural_of_existing so the two checks cannot drift apart.
    if is_plural_of_existing(word, accepted_words):
        return True
    # Is something accepted a plural of this word?
    return any(guess in accepted_words for guess in _plural_guesses(word))


# ---------------------------------------------------------------------------
# LLM categorization
# ---------------------------------------------------------------------------

# The category list in the prompt is generated from VALID_CATEGORIES so the
# prompt and the validation in parse_categories can never disagree.
_CATEGORY_LIST = ", ".join(sorted(VALID_CATEGORIES))

CATEGORIZE_SYSTEM = f"""You are a vocabulary categorizer. Given a list of concrete nouns, assign each one to one or more categories from this fixed list:

{_CATEGORY_LIST}

Rules:
- Use ONLY categories from the list above
- A word can have multiple categories (e.g., "brick" -> material, stone)
- If a word fits none of the categories well, output SKIP
- Output format: word: category1, category2
- One word per line"""

CATEGORIZE_USER = """Categorize these words:

{word_list}"""

# Compiled once; parse_categories applies them to every response line.
_NUMBER_PREFIX_RE = re.compile(r"^[\d]+[.)]\s*")
_BULLET_PREFIX_RE = re.compile(r"^[-*•]\s*")
_WORD_CATEGORIES_RE = re.compile(r"^([\w'-]+)\s*:\s*(.+)$")


def llm_chat_completion(messages, max_retries=3):
    """Chat completion with retry logic.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, sent as
            the ``messages`` field of the request body.
        max_retries: Total number of attempts. The wait between attempts
            doubles each time (1s, 2s, 4s, ...).

    Returns:
        The ``content`` string of the first choice, or None if every attempt
        failed.
    """
    # Imported lazily so --dry-run and --no-llm work without `requests`.
    import requests

    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json={
                "model": LLM_MODEL,
                "messages": messages,
            }, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError) as e:
            # RequestException: transport and HTTP-status failures.
            # ValueError: body was not JSON. KeyError/IndexError/TypeError:
            # JSON did not have the expected choices[0].message.content shape.
            wait = 2 ** attempt
            print(f" LLM call failed (attempt {attempt+1}/{max_retries}): {e}",
                  file=sys.stderr)
            if attempt < max_retries - 1:
                print(f" Retrying in {wait}s...", file=sys.stderr)
                time.sleep(wait)
            else:
                print(" Giving up on this batch.", file=sys.stderr)
    return None


def parse_categories(response_text, valid_words):
    """Parse LLM categorization response.

    Each useful line has the form ``word: cat1, cat2``. Leading list numbers
    or bullets, markdown bold markers and backticks are tolerated. Lines
    whose category part contains "SKIP" (any case) are ignored, as are words
    not in ``valid_words`` and categories not in VALID_CATEGORIES.

    Args:
        response_text: Raw LLM output, or None/empty if the call failed.
        valid_words: Container of lowercase words that may be returned.

    Returns:
        Dict mapping each recognised word to its list of valid categories,
        in the order given and without duplicates. Words with no valid
        category are omitted.
    """
    result = {}
    if not response_text:
        return result

    for raw_line in response_text.strip().split("\n"):
        # Drop markdown emphasis first, otherwise "**word**" would lose only
        # one asterisk to the bullet pattern below.
        line = raw_line.replace("**", "").replace("`", "").strip()
        if not line:
            continue

        # Strip bullets/numbers
        line = _NUMBER_PREFIX_RE.sub("", line)
        line = _BULLET_PREFIX_RE.sub("", line).strip()

        # Match: word: cat1, cat2
        match = _WORD_CATEGORIES_RE.match(line)
        if not match:
            continue

        word = match.group(1).strip().lower()
        cats_raw = match.group(2).strip()
        if "SKIP" in cats_raw.upper():
            continue

        cats = []
        for piece in cats_raw.split(","):
            # Tolerate trailing punctuation or quoting, e.g. "tool." or "'tool'".
            cat = piece.strip(" \t.;\"'").lower()
            if cat in VALID_CATEGORIES and cat not in cats:
                cats.append(cat)

        if word in valid_words and cats:
            result[word] = cats

    return result


def categorize_words(words, batch_size=25):
    """Categorize words using the LLM in batches.

    Args:
        words: List of words to categorize.
        batch_size: Number of words sent per LLM request.

    Returns:
        Dict mapping each categorized word to its list of categories. Words
        the LLM skipped, or whose batch failed, are absent.
    """
    all_categories = {}

    for start in range(0, len(words), batch_size):
        batch = words[start:start + batch_size]
        word_list = "\n".join(f"- {w}" for w in batch)
        messages = [
            {"role": "system", "content": CATEGORIZE_SYSTEM},
            {"role": "user", "content": CATEGORIZE_USER.format(word_list=word_list)},
        ]

        response = llm_chat_completion(messages)
        # Validate against this batch only: the model was asked about these
        # words, and the count printed below is relative to len(batch).
        parsed = parse_categories(response, set(batch))
        all_categories.update(parsed)

        print(f" Batch {start // batch_size + 1}: {len(parsed)}/{len(batch)} categorized")
        time.sleep(0.1)

    return all_categories


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def _load_candidate_sources(path):
    """Read the candidate CSV into ``(row_count, {word: {suggested_by, ...}})``.

    Words are stripped and lowercased so they compare equal to the lowercased
    words parse_categories returns; rows with an empty word are counted but
    not collected.
    """
    word_sources = defaultdict(set)
    row_count = 0
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            row_count += 1
            word = (row["word"] or "").strip().lower()
            if word:
                word_sources[word].add(row["suggested_by"])
    return row_count, word_sources


def _rejection_reason(word, citation_count, min_citations, existing_words,
                      accepted_set):
    """Return the reason `word` must be rejected, or None if it is acceptable."""
    # Minimum citation threshold
    if citation_count < min_citations:
        return "below_threshold"
    # No multi-word (underscore) candidates
    if "_" in word:
        return "multi_word"
    if word in existing_words:
        return "already_in_vocab"
    for reason, excluded in _EXCLUSION_SETS:
        if word in excluded:
            return reason
    # Gerund pattern catch-all (but allow exceptions)
    if word.endswith("ing") and word not in GERUND_EXCEPTIONS:
        return "gerund_pattern"
    if is_plural_of_existing(word, existing_words):
        return "plural_of_existing"
    if is_plural_of_candidate(word, accepted_set):
        return "plural_of_candidate"
    # Single character
    if len(word) < 2:
        return "too_short"
    return None


def _filter_candidates(word_sources, existing_words, min_citations):
    """Apply all quality filters; return ``(accepted, reject_reasons)``.

    ``accepted`` is a list of ``(word, citation_count)`` ordered by citation
    count descending, ties broken alphabetically so the result does not
    depend on the row order of the candidate file.
    """
    accepted = []
    accepted_set = set()
    reject_reasons = Counter()

    ranked = sorted(word_sources.items(), key=lambda item: (-len(item[1]), item[0]))
    for word, sources in ranked:
        citation_count = len(sources)
        reason = _rejection_reason(word, citation_count, min_citations,
                                   existing_words, accepted_set)
        if reason is not None:
            reject_reasons[reason] += 1
            continue
        accepted.append((word, citation_count))
        accepted_set.add(word)

    return accepted, reject_reasons


def _append_vocab_rows(path, fieldnames, rows):
    """Append `rows` (dicts) to the vocab CSV at `path`.

    Columns are written in the order given by ``fieldnames`` (the file's own
    header), with empty strings for columns a row does not supply. When
    ``fieldnames`` is falsy the file is assumed to have no header, and
    VOCAB_FIELDNAMES is written as one. A line break is inserted first if
    the file does not already end with one, so the first new row cannot be
    glued onto the last existing row.
    """
    write_header = not fieldnames
    columns = list(fieldnames) if fieldnames else VOCAB_FIELDNAMES

    needs_newline = False
    with open(path, "rb") as f:
        f.seek(0, 2)  # jump to end of file
        if f.tell() > 0:
            f.seek(-1, 2)
            needs_newline = f.read(1) not in (b"\n", b"\r")

    with open(path, "a", newline="", encoding="utf-8") as f:
        if needs_newline:
            f.write("\n")
        writer = csv.DictWriter(f, fieldnames=columns, restval="",
                                extrasaction="ignore")
        if write_header:
            writer.writeheader()
        writer.writerows(rows)


def main():
    """Command-line entry point: filter candidates, categorize, append to vocab."""
    parser = argparse.ArgumentParser(
        description="Expand folksy vocabulary with LLM-suggested candidates."
    )
    parser.add_argument("--min-citations", type=int, default=5,
                        help="Minimum number of vocab words that suggested this candidate (default: 5)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be added without modifying files")
    parser.add_argument("--no-llm", action="store_true",
                        help="Skip LLM categorization (use placeholder categories)")
    args = parser.parse_args()

    # Fail with a readable message rather than a traceback.
    for required in (VOCAB_CSV, CANDIDATE_CSV):
        if not required.is_file():
            sys.exit(f"error: required input file not found: {required}")

    # Load existing vocab; remember its header so appended rows line up.
    with open(VOCAB_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        vocab_fieldnames = reader.fieldnames
        existing_words = {row["word"] for row in reader}
    print(f"Existing vocabulary: {len(existing_words)} words")

    # Load candidates and aggregate unique sources per candidate word
    row_count, word_sources = _load_candidate_sources(CANDIDATE_CSV)
    print(f"Total candidate rows: {row_count}")
    print(f"Unique candidate words: {len(word_sources)}")

    # Normalize plurals: merge citation counts into singular forms
    normalized_sources = defaultdict(set)
    for word, sources in word_sources.items():
        singular, _ = singularize(word)
        normalized_sources[singular].update(sources)
    word_sources = dict(normalized_sources)
    print(f"After singularization: {len(word_sources)} unique candidates")

    # Filter
    accepted, reject_reasons = _filter_candidates(
        word_sources, existing_words, args.min_citations
    )

    print("\nFiltering results:")
    print(f" Accepted: {len(accepted)}")
    for reason, count in reject_reasons.most_common():
        print(f" Rejected ({reason}): {count}")

    if not accepted:
        print("\nNo candidates passed filtering.")
        return

    # Show accepted words
    print(f"\nAccepted candidates ({len(accepted)}):")
    for word, count in accepted:
        print(f" {word:25s} cited by {count:3d} vocab words")

    if args.dry_run:
        print(f"\nDry run complete. Would add {len(accepted)} words to vocabulary.")
        return

    # Categorize with LLM
    words_to_categorize = [w for w, _ in accepted]

    if args.no_llm:
        print("\nSkipping LLM categorization (--no-llm). Using 'material' as placeholder.")
        categories = {w: ["material"] for w in words_to_categorize}
    else:
        print(f"\nCategorizing {len(words_to_categorize)} words with LLM...")
        categories = categorize_words(words_to_categorize)

    # Words the LLM couldn't categorize get skipped
    uncategorized = [w for w in words_to_categorize if w not in categories]
    if uncategorized:
        print(f"\n {len(uncategorized)} words could not be categorized (skipped):")
        for w in uncategorized:
            print(f" {w}")

    # Build new vocab entries
    new_entries = [
        {
            "word": word,
            "categories": ",".join(categories[word]),
            "tangibility_score": "0.80",
            "conceptnet_edge_count": "0",
            "frequency_rank": "0",
        }
        for word, _ in accepted
        if word in categories
    ]

    if not new_entries:
        print("\nNo entries to add after categorization.")
        return

    # Backup existing vocab
    backup_path = VOCAB_CSV.with_suffix(f".csv.bak.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    shutil.copy2(VOCAB_CSV, backup_path)
    print(f"\nBacked up vocabulary to {backup_path.name}")

    # Append to vocab CSV
    _append_vocab_rows(VOCAB_CSV, vocab_fieldnames, new_entries)

    print(f"\nAdded {len(new_entries)} words to {VOCAB_CSV.name}")
    print(f"New vocabulary size: {len(existing_words) + len(new_entries)}")

    # Summary by category
    cat_counts = Counter()
    for entry in new_entries:
        for c in entry["categories"].split(","):
            cat_counts[c.strip()] += 1

    print("\nNew words by category:")
    for cat, count in cat_counts.most_common():
        print(f" {cat:20s} {count:3d}")

    print("\nNext step: run 'python scripts/enhance_graph.py --phase 1' to generate edges for new words.")


if __name__ == "__main__":
    main()