folksy_idioms/scripts/expand_vocab.py

512 lines
19 KiB
Python

#!/usr/bin/env python3
"""Expand folksy vocabulary with high-quality candidates from LLM suggestions.

Reads candidate_additions.csv (words suggested by the LLM during phase 1 that
weren't in the vocab), filters for quality, uses the LLM to assign categories,
and appends the survivors to folksy_vocab.csv.

After running this, re-run `enhance_graph.py --phase 1` to generate edges
for the new words (the checkpoint will skip already-processed words).

Usage:
    python scripts/expand_vocab.py                     # Full run
    python scripts/expand_vocab.py --dry-run           # Show what would be added
    python scripts/expand_vocab.py --min-citations 8   # Stricter threshold
    python scripts/expand_vocab.py --no-llm            # Skip LLM categorization
"""
import argparse
import csv
import json
import re
import shutil
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
# Filesystem layout, resolved relative to this file: the script's own
# directory, its parent (the project root), and the project's data directory.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"
# HTTP chat-completions endpoint and model name used by llm_chat_completion().
# NOTE(review): the "/v1d/" path segment looks unusual — confirm it is not a
# typo for "/v1/".
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"
# VOCAB_CSV is read at startup and appended to at the end of a full run;
# CANDIDATE_CSV is the read-only list of suggested words.
VOCAB_CSV = DATA_DIR / "folksy_vocab.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"
# Valid categories from the existing vocabulary.
# parse_categories() silently drops any category the LLM returns that is not
# in this set. The same list is spelled out in the CATEGORIZE_SYSTEM prompt,
# so keep the two in sync when adding or removing a category.
VALID_CATEGORIES = {
    "animal", "beverage", "bird", "building", "clothing", "container", "crop",
    "fabric", "fish", "flower", "food", "fruit", "furniture", "grain", "herb",
    "insect", "instrument", "landscape", "material", "metal", "mineral",
    "organism", "plant", "rock", "seed", "shelter", "spice", "stone",
    "structure", "tool", "tree", "vegetable", "vehicle", "water", "weapon", "wood",
}
# ---------------------------------------------------------------------------
# Exclusion lists
# ---------------------------------------------------------------------------
# Abstract concepts, emotions, processes — not concrete enough for folksy vocab.
# Each word appears exactly once; the earlier literal repeated "fear",
# "nutrition", "fullness", "fragrance" and "contents", which a set collapses
# silently but which makes the list harder to audit.
EXCLUDE_ABSTRACT = {
    "ecosystem", "satisfaction", "fullness", "warmth", "fear", "relaxation",
    "growth", "interest", "nature", "protection", "digestion", "injury",
    "decoration", "construction", "landscape", "noise", "sound", "energy",
    "nourishment", "nutrition", "pollination", "sustainability", "tradition",
    "biodiversity", "symbolism", "elegance", "resilience", "patience",
    "beauty", "abundance", "fertility", "creativity", "harmony", "comfort",
    "curiosity", "companionship", "loyalty", "aggression", "alertness",
    "camouflage", "predation", "migration", "hibernation", "decomposition",
    "erosion", "combustion", "fermentation", "oxidation", "corrosion",
    "photosynthesis", "respiration", "evaporation", "precipitation",
    "transpiration", "germination", "excitement", "enjoyment", "satiety",
    "stability", "organization", "fragrance", "moisture", "wildlife",
    "preservation", "conversation", "inspiration", "storage", "observation",
    "hydration", "destruction", "entertainment", "education", "knowledge",
    "safety", "practice", "research", "skill", "space", "license",
    "collection", "habitat", "pollution", "health", "vibration", "wonder",
    "awe", "refreshment", "irritation", "happiness", "joy", "damage",
    "death", "pain", "thirst", "alarm", "contents", "ingredients",
    "electricity", "oxygen", "navigation", "recreation", "meditation",
    "celebration", "communication", "imagination", "devotion",
    "ambition", "endurance", "independence", "discipline", "cooperation",
    "sweetness", "aroma", "flavor", "texture",
    "smell", "color", "surface", "bottom", "edge",
    "nutrients", "study", "outfit", "upholstery",
}
# Scientific/technical — not folksy enough for folk wisdom.
# "cellulose" used to be listed twice; it now appears once.
EXCLUDE_TECHNICAL = {
    "cellulose", "exoskeleton", "protein", "tissue", "cells", "alloy",
    "enzyme", "chlorophyll", "genome", "photon",
    "organism", "molecule", "compound", "polymer", "isotope",
    "ecosystem", "metabolism", "catalyst", "membrane", "chromosome",
    "cell", "nutrient", "ingredient", "material", "content",
}
# Collective/institutional nouns — not concrete individual things.
# "collection" and "organization" are also in EXCLUDE_ABSTRACT; main() tests
# that set first, so those two are reported under the "abstract" reason.
EXCLUDE_INSTITUTIONAL = {
    "orchestra", "fleet", "arsenal", "toolkit", "collection",
    "restaurant", "museum", "university", "corporation", "organization",
    "musician", "breakfast", "dinner", "meal", "dish", "sandwich",
    "seafood", "refrigerator", "garage", "basement", "park",
}
# Adjectives and properties — useful as HasProperty targets but not as vocab words.
# Matched by exact string equality in main(), after plural normalization.
EXCLUDE_ADJECTIVES = {
    "small", "large", "heavy", "colorful", "green", "brown", "hard",
    "white", "round", "sharp", "sturdy", "long", "soft", "flat",
    "sweet", "bitter", "smooth", "rough", "bright", "dark", "dry",
    "wet", "thick", "thin", "warm", "cold", "hot", "tall", "short",
    "red", "blue", "yellow", "black", "grey", "gray", "pink",
    "fragrant", "loud", "spicy", "sour", "tough", "delicate", "strong",
    "weak", "light", "dense", "portable", "lightweight", "transparent",
    "opaque", "flexible", "rigid", "brittle", "elastic", "porous",
    "compact", "edible", "toxic", "aromatic", "nocturnal", "aquatic",
    "durable", "cylindrical", "wooden", "shiny", "solid", "narrow",
    "metallic", "pungent", "juicy", "fast", "powerful", "woody",
    "fibrous", "savory", "liquid", "enclosed", "rectangular", "wild",
    "feathered", "leafy", "crunchy", "dangerous", "fuzzy", "slimy",
    "natural", "waterproof", "electronic",
}
# Words that are clearly verbs or gerunds.
# main() additionally rejects any other word ending in "ing" apart from a
# short allow-list, so the gerunds here mainly determine which reject reason
# is reported. Note that "building" is excluded as a candidate word even
# though "building" is also one of the VALID_CATEGORIES names.
EXCLUDE_VERBS = {
    "eating", "cooking", "growing", "fishing", "hunting", "flying",
    "mining", "flavoring", "singing", "blooming", "holding", "baking",
    "ripening", "opening", "cutting", "protecting", "seasoning",
    "storing", "building", "swimming", "brewing", "weaving", "carving",
    "climbing", "digging", "plowing", "sewing", "spinning", "tanning",
    "swim", "run", "grow", "eat", "hunt", "peck", "bite", "dive",
    "crawl", "cut", "shine", "sparkle",
}
# Plurals whose singular cannot be recovered by a suffix rule.
_IRREGULAR_PLURALS = {
    "teeth": "tooth", "feet": "foot", "geese": "goose", "mice": "mouse",
    "lice": "louse", "dice": "die", "oxen": "ox", "children": "child",
    # -f / -fe nouns. Every other "-ves" word is treated as "-ve" + "s"
    # (gloves, olives, caves, stoves, ...), which is far more common.
    "leaves": "leaf", "loaves": "loaf", "halves": "half", "knives": "knife",
    "lives": "life", "wives": "wife", "wolves": "wolf", "shelves": "shelf",
    "calves": "calf", "hooves": "hoof", "scarves": "scarf",
    "thieves": "thief", "elves": "elf", "sheaves": "sheaf",
    "selves": "self", "wharves": "wharf", "dwarves": "dwarf",
    # Singular itself ends in "s" but is not caught by the "-uses" rule.
    "gases": "gas", "lenses": "lens", "irises": "iris",
    "canvases": "canvas", "atlases": "atlas",
    # "-ie" nouns that the "-ies -> -y" rule would otherwise mangle.
    "cookies": "cookie", "brownies": "brownie", "prairies": "prairie",
    "movies": "movie",
}

# Nouns that end in "s" in the singular and must not be truncated.
_SINGULAR_S_WORDS = {
    "lens", "gas", "canvas", "atlas", "molasses", "series", "species",
}

# Singulars ending in "-oe": their plural drops only the final "s".
_OE_SINGULARS = {
    "shoe", "toe", "hoe", "canoe", "oboe", "doe", "foe", "floe", "roe",
    "aloe", "woe", "horseshoe", "snowshoe",
}


def singularize(word):
    """Best-effort singularization. Returns (singular, was_plural).

    The rules are heuristic and tuned for concrete English nouns:

    * an explicit table handles irregular plurals (teeth, wolves, lenses, ...);
    * a small set of singular nouns ending in "s" is returned unchanged;
    * "-ies" becomes "-y" (berries -> berry) for words longer than 4 letters;
    * "-oes" drops "es" (tomatoes -> tomato) unless the singular ends in
      "-oe" (shoes -> shoe);
    * "-sses", "-xes", "-ches", "-shes", "-zzes", "-tzes" drop "es";
    * "-uses" drops "es" (buses -> bus, cactuses -> cactus) except after
      "a"/"o" (houses -> house, clauses -> clause);
    * any other trailing "s" is dropped unless the word ends in "ss", "us"
      or "is" (horses -> horse, gloves -> glove, mazes -> maze).

    Args:
        word: A lowercase word.

    Returns:
        A ``(singular, was_plural)`` tuple. ``was_plural`` is False when the
        word was returned unchanged.
    """
    if word in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[word], True
    if word in _SINGULAR_S_WORDS:
        return word, False

    # -ies -> -y (the length guard keeps "pies"/"ties" for the plain -s rule)
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y", True

    # -oes: "tomatoes" -> "tomato", but "shoes" -> "shoe"
    if word.endswith("oes"):
        if word[:-1] in _OE_SINGULARS:
            return word[:-1], True
        return word[:-2], True

    # Sibilant stems that take "-es" in the plural
    if word.endswith(("sses", "xes", "ches", "shes", "zzes", "tzes")):
        return word[:-2], True

    # "-uses" is usually "-us" + "es"; "-auses"/"-ouses" are "-se" + "s"
    if word.endswith("uses") and not word.endswith(("auses", "ouses")):
        return word[:-2], True

    # Plain -s (but not -ss, -us, -is, which are typically singular)
    if word.endswith("s") and not word.endswith(("ss", "us", "is")):
        return word[:-1], True

    return word, False
def is_plural_of_existing(word, existing_vocab):
    """Report whether *word* looks like the plural of a word already in the vocab.

    Every singular that a common plural suffix could have come from is
    generated, and the result is True as soon as one of them is found in
    *existing_vocab* (any container supporting ``in``).
    """
    singular_guesses = []
    if word.endswith("s"):
        # cat + s
        singular_guesses.append(word[:-1])
    if word.endswith("es"):
        # box + es
        singular_guesses.append(word[:-2])
    if word.endswith("ies"):
        # berry -> berries
        singular_guesses.append(word[:-3] + "y")
    if word.endswith("ves"):
        # wolf -> wolves, knife -> knives
        stem = word[:-3]
        singular_guesses.extend((stem + "f", stem + "fe"))
    return any(guess in existing_vocab for guess in singular_guesses)
def is_plural_of_candidate(word, accepted_words):
    """Check if word is a plural of another candidate, or vice versa.

    Both directions apply the same suffix rules: plain "-s", "-es",
    "-y"/"-ies" and "-f"/"-fe"/"-ves". Previously the "-ves" rule was only
    tried in the singular-to-plural direction and the "-ies" rule only in the
    plural-to-singular direction, so pairs such as ("knives", "knife") or
    ("berry", "berries") were missed depending on which was accepted first.

    Args:
        word: The candidate being considered.
        accepted_words: Container (supporting ``in``) of already-accepted words.

    Returns:
        True if some singular or plural form of *word* is already accepted.
    """
    related_forms = []

    # Is this word a plural of something accepted?
    if word.endswith("s"):
        related_forms.append(word[:-1])
    if word.endswith("es"):
        related_forms.append(word[:-2])
    if word.endswith("ies"):
        related_forms.append(word[:-3] + "y")
    if word.endswith("ves"):
        stem = word[:-3]
        related_forms.extend((stem + "f", stem + "fe"))

    # Is something accepted a plural of this word?
    related_forms.extend((word + "s", word + "es"))
    if word.endswith("y"):
        related_forms.append(word[:-1] + "ies")
    if word.endswith("f"):
        related_forms.append(word[:-1] + "ves")
    if word.endswith("fe"):
        related_forms.append(word[:-2] + "ves")

    return any(form in accepted_words for form in related_forms)
# ---------------------------------------------------------------------------
# LLM categorization
# ---------------------------------------------------------------------------
# System prompt for the categorization call. The category list embedded in it
# repeats VALID_CATEGORIES; keep the two in sync. The "word: category1,
# category2" output format and the SKIP keyword requested here are what
# parse_categories() looks for.
CATEGORIZE_SYSTEM = """You are a vocabulary categorizer. Given a list of concrete nouns, assign each one to one or more categories from this fixed list:
animal, beverage, bird, building, clothing, container, crop, fabric, fish, flower, food, fruit, furniture, grain, herb, insect, instrument, landscape, material, metal, mineral, organism, plant, rock, seed, shelter, spice, stone, structure, tool, tree, vegetable, vehicle, water, weapon, wood
Rules:
- Use ONLY categories from the list above
- A word can have multiple categories (e.g., "brick" -> material, stone)
- If a word fits none of the categories well, output SKIP
- Output format: word: category1, category2
- One word per line"""
# User-message template; categorize_words() fills {word_list} with one
# "- word" bullet per line.
CATEGORIZE_USER = """Categorize these words:
{word_list}"""
def llm_chat_completion(messages, max_retries=3):
    """POST *messages* to LLM_ENDPOINT and return the reply text.

    Any exception raised while sending the request or unpacking the response
    counts as a failed attempt. Each failure is reported on stderr; all but
    the last are followed by a pause of 2**attempt seconds (1, 2, 4, ...)
    before the next try.

    Returns:
        The ``content`` string of the first choice in the response, or None
        once ``max_retries`` attempts have failed.
    """
    import requests  # imported lazily; this is the only function that uses it

    payload = {
        "model": LLM_MODEL,
        "messages": messages,
    }
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            resp = requests.post(LLM_ENDPOINT, json=payload, timeout=120)
            resp.raise_for_status()
            return resp.json()["choices"][0]["message"]["content"]
        except Exception as e:
            print(f" LLM call failed (attempt {attempt+1}/{max_retries}): {e}",
                  file=sys.stderr)
            if attempt >= final_attempt:
                print(f" Giving up on this batch.", file=sys.stderr)
                continue
            wait = 2 ** attempt
            print(f" Retrying in {wait}s...", file=sys.stderr)
            time.sleep(wait)
    return None
def parse_categories(response_text, valid_words):
    """Turn the LLM's categorization reply into a ``{word: [categories]}`` dict.

    Each non-empty line is expected to read ``word: cat1, cat2`` and may be
    prefixed by a list number ("1." / "1)") and/or a bullet ("-", "*", "•").
    A line is dropped when it does not match that shape, when its category
    part contains "skip" in any casing, when the word is not in
    *valid_words*, or when none of its categories is in VALID_CATEGORIES.
    Unknown categories on an otherwise good line are discarded individually.
    """
    if not response_text:
        return {}

    parsed = {}
    for raw_line in response_text.strip().split("\n"):
        entry = raw_line.strip()
        if not entry:
            continue

        # Strip bullets/numbers
        entry = re.sub(r"^[\d]+[.)]\s*", "", entry)
        entry = re.sub(r"^[-*•]\s*", "", entry).strip()

        # Match: word: cat1, cat2
        hit = re.match(r"^(\w+)\s*:\s*(.+)$", entry)
        if hit is None:
            continue

        token = hit.group(1).strip().lower()
        cats_raw = hit.group(2).strip()
        if "SKIP" in cats_raw.upper():
            continue

        cleaned = (piece.strip().lower() for piece in cats_raw.split(","))
        cats = [c for c in cleaned if c in VALID_CATEGORIES]
        if cats and token in valid_words:
            parsed[token] = cats
    return parsed
def categorize_words(words, batch_size=25):
    """Ask the LLM to categorize *words*, ``batch_size`` words per request.

    Returns a dict mapping each successfully categorized word to its list of
    categories. Words the model skipped, or whose batch failed, are simply
    absent from the result. A one-line progress summary is printed per batch.
    """
    known_words = set(words)
    merged = {}
    batch_starts = range(0, len(words), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        chunk = words[start:start + batch_size]
        bullets = "\n".join(f"- {w}" for w in chunk)
        response = llm_chat_completion([
            {"role": "system", "content": CATEGORIZE_SYSTEM},
            {"role": "user", "content": CATEGORIZE_USER.format(word_list=bullets)},
        ])
        parsed = parse_categories(response, known_words)
        merged.update(parsed)
        print(f" Batch {batch_no}: {len(parsed)}/{len(chunk)} categorized")
        # Brief pause between consecutive requests.
        time.sleep(0.1)
    return merged
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _rejection_reason(word, citation_count, min_citations, existing_words, accepted_set):
    """Return the name of the first filter that rejects *word*, or None if it passes."""
    # Minimum citation threshold
    if citation_count < min_citations:
        return "below_threshold"
    # No multi-word (underscore) candidates
    if "_" in word:
        return "multi_word"
    # Already in vocab
    if word in existing_words:
        return "already_in_vocab"
    if word in EXCLUDE_ABSTRACT:
        return "abstract"
    if word in EXCLUDE_ADJECTIVES:
        return "adjective"
    if word in EXCLUDE_VERBS:
        return "verb_gerund"
    if word in EXCLUDE_TECHNICAL:
        return "technical"
    if word in EXCLUDE_INSTITUTIONAL:
        return "institutional"
    # Gerund pattern catch-all (but allow exceptions)
    if word.endswith("ing") and word not in {"ring", "spring", "string", "wing", "ceiling"}:
        return "gerund_pattern"
    # Plurals of existing vocab
    if is_plural_of_existing(word, existing_words):
        return "plural_of_existing"
    # Plurals of already-accepted candidates
    if is_plural_of_candidate(word, accepted_set):
        return "plural_of_candidate"
    # Single character (or empty)
    if len(word) < 2:
        return "too_short"
    return None


def _missing_final_newline(path):
    """Return True when *path* is non-empty and does not end with a line break."""
    tail = path.read_bytes()[-1:]
    return bool(tail) and tail not in (b"\n", b"\r")


def main():
    """Command-line entry point: filter candidates, categorize them, extend the vocab.

    Steps:
      1. Load the existing vocabulary and the candidate suggestions.
      2. Merge plural candidates into their singular form, pooling citations.
      3. Apply the quality filters and print a per-reason rejection summary.
      4. Stop here when ``--dry-run`` is given.
      5. Assign categories (via the LLM, or a placeholder with ``--no-llm``).
      6. Back up VOCAB_CSV, then append the new rows to it.
    """
    parser = argparse.ArgumentParser(
        description="Expand folksy vocabulary with LLM-suggested candidates."
    )
    parser.add_argument("--min-citations", type=int, default=5,
                        help="Minimum number of vocab words that suggested this candidate (default: 5)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be added without modifying files")
    parser.add_argument("--no-llm", action="store_true",
                        help="Skip LLM categorization (use placeholder categories)")
    args = parser.parse_args()

    # Load existing vocab (only the words themselves are needed)
    with open(VOCAB_CSV, newline="", encoding="utf-8") as f:
        existing_words = {row["word"] for row in csv.DictReader(f)}
    print(f"Existing vocabulary: {len(existing_words)} words")

    # Load candidates, counting unique sources per candidate word
    word_sources = defaultdict(set)
    candidate_rows = 0
    with open(CANDIDATE_CSV, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            candidate_rows += 1
            word_sources[row["word"]].add(row["suggested_by"])
    print(f"Total candidate rows: {candidate_rows}")
    print(f"Unique candidate words: {len(word_sources)}")

    # Normalize plurals: merge citation counts into singular forms
    normalized_sources = defaultdict(set)
    for word, sources in word_sources.items():
        singular, _ = singularize(word)
        normalized_sources[singular].update(sources)
    word_sources = dict(normalized_sources)
    print(f"After singularization: {len(word_sources)} unique candidates")

    # Filter, visiting the most-cited candidates first so that when two
    # related forms both survive, the better-supported one is kept.
    accepted = []
    accepted_set = set()
    reject_reasons = Counter()
    sorted_candidates = sorted(word_sources.items(), key=lambda x: len(x[1]), reverse=True)
    for word, sources in sorted_candidates:
        citation_count = len(sources)
        reason = _rejection_reason(word, citation_count, args.min_citations,
                                   existing_words, accepted_set)
        if reason is not None:
            reject_reasons[reason] += 1
            continue
        accepted.append((word, citation_count))
        accepted_set.add(word)

    print("\nFiltering results:")
    print(f" Accepted: {len(accepted)}")
    for reason, count in reject_reasons.most_common():
        print(f" Rejected ({reason}): {count}")
    if not accepted:
        print("\nNo candidates passed filtering.")
        return

    # Show accepted words
    print(f"\nAccepted candidates ({len(accepted)}):")
    for word, count in accepted:
        print(f" {word:25s} cited by {count:3d} vocab words")
    if args.dry_run:
        print(f"\nDry run complete. Would add {len(accepted)} words to vocabulary.")
        return

    # Categorize with LLM
    words_to_categorize = [w for w, _ in accepted]
    if args.no_llm:
        print("\nSkipping LLM categorization (--no-llm). Using 'material' as placeholder.")
        categories = {w: ["material"] for w in words_to_categorize}
    else:
        print(f"\nCategorizing {len(words_to_categorize)} words with LLM...")
        categories = categorize_words(words_to_categorize)

    # Words the LLM couldn't categorize get skipped
    uncategorized = [w for w in words_to_categorize if w not in categories]
    if uncategorized:
        print(f"\n {len(uncategorized)} words could not be categorized (skipped):")
        for w in uncategorized:
            print(f" {w}")

    # Build new vocab entries; the numeric columns get fixed default values.
    new_entries = [
        {
            "word": word,
            "categories": ",".join(categories[word]),
            "tangibility_score": "0.80",
            "conceptnet_edge_count": "0",
            "frequency_rank": "0",
        }
        for word, _ in accepted
        if word in categories
    ]
    if not new_entries:
        print("\nNo entries to add after categorization.")
        return

    # Backup existing vocab
    backup_path = VOCAB_CSV.with_suffix(f".csv.bak.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    shutil.copy2(VOCAB_CSV, backup_path)
    print(f"\nBacked up vocabulary to {backup_path.name}")

    # Append to vocab CSV. If the file's last line has no terminator, the
    # first appended row would be glued onto it, so add one first.
    needs_newline = _missing_final_newline(VOCAB_CSV)
    with open(VOCAB_CSV, "a", newline="", encoding="utf-8") as f:
        if needs_newline:
            f.write("\n")
        writer = csv.DictWriter(f, fieldnames=["word", "categories", "tangibility_score",
                                               "conceptnet_edge_count", "frequency_rank"])
        writer.writerows(new_entries)
    print(f"\nAdded {len(new_entries)} words to {VOCAB_CSV.name}")
    print(f"New vocabulary size: {len(existing_words) + len(new_entries)}")

    # Summary by category
    cat_counts = Counter()
    for entry in new_entries:
        for c in entry["categories"].split(","):
            cat_counts[c.strip()] += 1
    print("\nNew words by category:")
    for cat, count in cat_counts.most_common():
        print(f" {cat:20s} {count:3d}")
    print("\nNext step: run 'python scripts/enhance_graph.py --phase 1' to generate edges for new words.")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()