512 lines
19 KiB
Python
512 lines
19 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Expand folksy vocabulary with high-quality candidates from LLM suggestions.
|
||
|
|
|
||
|
|
Reads candidate_additions.csv (words suggested by the LLM during phase 1 that
|
||
|
|
weren't in the vocab), filters for quality, uses the LLM to assign categories,
|
||
|
|
and appends the survivors to folksy_vocab.csv.
|
||
|
|
|
||
|
|
After running this, re-run `enhance_graph.py --phase 1` to generate edges
|
||
|
|
for the new words (the checkpoint will skip already-processed words).
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python scripts/expand_vocab.py # Full run
|
||
|
|
python scripts/expand_vocab.py --dry-run # Show what would be added
|
||
|
|
python scripts/expand_vocab.py --min-citations 8 # Stricter threshold
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import csv
|
||
|
|
import json
|
||
|
|
import re
|
||
|
|
import shutil
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
from collections import Counter, defaultdict
|
||
|
|
from datetime import datetime
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Filesystem layout, resolved relative to this file: the script's own
# directory, its parent (the project root), and the project's data directory.
SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
DATA_DIR = PROJECT_DIR / "data"

# Chat-completions endpoint and model name sent with every LLM request.
# NOTE(review): the "/v1d/" path segment looks unusual — confirm it is not a
# typo for "/v1/" before relying on it.
LLM_ENDPOINT = "http://192.168.1.100:8853/v1d/chat/completions"
LLM_MODEL = "THUDM-GLM4-32B"

# VOCAB_CSV is read for the existing words and appended to with the survivors;
# CANDIDATE_CSV is read-only input (one row per word/suggested_by pair).
VOCAB_CSV = DATA_DIR / "folksy_vocab.csv"
CANDIDATE_CSV = DATA_DIR / "candidate_additions.csv"
|
||
|
|
|
||
|
|
# Closed set of category labels a vocabulary word may carry. Anything the LLM
# returns that is not in this set is discarded during parsing.
VALID_CATEGORIES = {
    "animal", "beverage", "bird", "building",
    "clothing", "container", "crop", "fabric",
    "fish", "flower", "food", "fruit",
    "furniture", "grain", "herb", "insect",
    "instrument", "landscape", "material", "metal",
    "mineral", "organism", "plant", "rock",
    "seed", "shelter", "spice", "stone",
    "structure", "tool", "tree", "vegetable",
    "vehicle", "water", "weapon", "wood",
}
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Exclusion lists
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# Abstract concepts, emotions, processes — not concrete enough for folksy vocab.
# Each entry is listed exactly once; the original literal repeated "fear",
# "nutrition", "fullness", "fragrance" and "contents", which a set silently
# collapses but which makes the list harder to audit.
# NOTE(review): main() compares singularized candidates against this set, so
# plural entries such as "contents", "ingredients" and "nutrients" presumably
# never match; they are kept so the set's value is unchanged.
EXCLUDE_ABSTRACT = {
    "ecosystem", "satisfaction", "fullness", "warmth", "fear", "relaxation",
    "growth", "interest", "nature", "protection", "digestion", "injury",
    "decoration", "construction", "landscape", "noise", "sound", "energy",
    "nourishment", "nutrition", "pollination", "sustainability", "tradition",
    "biodiversity", "symbolism", "elegance", "resilience", "patience",
    "beauty", "abundance", "fertility", "creativity", "harmony", "comfort",
    "curiosity", "companionship", "loyalty", "aggression", "alertness",
    "camouflage", "predation", "migration", "hibernation", "decomposition",
    "erosion", "combustion", "fermentation", "oxidation", "corrosion",
    "photosynthesis", "respiration", "evaporation", "precipitation",
    "transpiration", "germination", "excitement", "enjoyment", "satiety",
    "stability", "organization", "fragrance", "moisture", "wildlife",
    "preservation", "conversation", "inspiration", "storage", "observation",
    "hydration", "destruction", "entertainment", "education", "knowledge",
    "safety", "practice", "research", "skill", "space", "license",
    "collection", "habitat", "pollution", "health", "vibration", "wonder",
    "awe", "refreshment", "irritation", "happiness", "joy", "damage",
    "death", "pain", "thirst", "alarm", "contents", "ingredients",
    "electricity", "oxygen", "navigation", "recreation", "meditation",
    "celebration", "communication", "imagination", "devotion",
    "ambition", "endurance", "independence", "discipline", "cooperation",
    "sweetness", "aroma", "flavor", "texture",
    "smell", "color", "surface", "bottom", "edge",
    "nutrients", "study", "outfit", "upholstery",
}
|
||
|
|
|
||
|
|
# Scientific/technical — not folksy enough for folk wisdom.
# Each entry is listed exactly once (the original literal repeated "cellulose").
# NOTE(review): main() compares singularized candidates against this set, so
# the plural entry "cells" presumably never matches; "cell" covers it.
EXCLUDE_TECHNICAL = {
    "cellulose", "exoskeleton", "protein", "tissue", "cells", "alloy",
    "enzyme", "chlorophyll", "genome", "photon",
    "organism", "molecule", "compound", "polymer", "isotope",
    "ecosystem", "metabolism", "catalyst", "membrane", "chromosome",
    "cell", "nutrient", "ingredient", "material", "content",
}
|
||
|
|
|
||
|
|
# Collections, institutions, meals and other words rejected under the
# "institutional" reason: they do not name a single concrete folk object.
EXCLUDE_INSTITUTIONAL = {
    "orchestra", "fleet", "arsenal",
    "toolkit", "collection", "restaurant",
    "museum", "university", "corporation",
    "organization", "musician", "breakfast",
    "dinner", "meal", "dish",
    "sandwich", "seafood", "refrigerator",
    "garage", "basement", "park",
}
|
||
|
|
|
||
|
|
# Property words. They make sense as HasProperty edge targets, but they are
# never admitted as vocabulary entries of their own.
EXCLUDE_ADJECTIVES = {
    "small", "large", "heavy", "colorful", "green", "brown",
    "hard", "white", "round", "sharp", "sturdy", "long",
    "soft", "flat", "sweet", "bitter", "smooth", "rough",
    "bright", "dark", "dry", "wet", "thick", "thin",
    "warm", "cold", "hot", "tall", "short", "red",
    "blue", "yellow", "black", "grey", "gray", "pink",
    "fragrant", "loud", "spicy", "sour", "tough", "delicate",
    "strong", "weak", "light", "dense", "portable", "lightweight",
    "transparent", "opaque", "flexible", "rigid", "brittle", "elastic",
    "porous", "compact", "edible", "toxic", "aromatic", "nocturnal",
    "aquatic", "durable", "cylindrical", "wooden", "shiny", "solid",
    "narrow", "metallic", "pungent", "juicy", "fast", "powerful",
    "woody", "fibrous", "savory", "liquid", "enclosed", "rectangular",
    "wild", "feathered", "leafy", "crunchy", "dangerous", "fuzzy",
    "slimy", "natural", "waterproof", "electronic",
}
|
||
|
|
|
||
|
|
# Action words rejected under the "verb_gerund" reason.
EXCLUDE_VERBS = {
    # Gerunds / present participles.
    "eating", "cooking", "growing", "fishing", "hunting",
    "flying", "mining", "flavoring", "singing", "blooming",
    "holding", "baking", "ripening", "opening", "cutting",
    "protecting", "seasoning", "storing", "building", "swimming",
    "brewing", "weaving", "carving", "climbing", "digging",
    "plowing", "sewing", "spinning", "tanning",
    # Bare verb forms.
    "swim", "run", "grow", "eat", "hunt", "peck",
    "bite", "dive", "crawl", "cut", "shine", "sparkle",
}
|
||
|
|
|
||
|
|
|
||
|
|
# Plural -> singular pairs that the suffix rules in singularize() cannot derive.
_IRREGULAR_PLURALS = {
    # Vowel-change and other fully irregular plurals.
    "teeth": "tooth", "feet": "foot", "geese": "goose", "mice": "mouse",
    "lice": "louse", "dice": "die", "oxen": "ox", "children": "child",
    # -ves plurals of nouns ending in -f / -fe. Every other -ves word
    # (olives, gloves, caves, ...) is a plain "-s" plural of an -ve noun.
    "leaves": "leaf", "loaves": "loaf", "halves": "half", "knives": "knife",
    "lives": "life", "wives": "wife", "wolves": "wolf", "shelves": "shelf",
    "calves": "calf", "elves": "elf", "scarves": "scarf", "hooves": "hoof",
    "thieves": "thief", "sheaves": "sheaf", "wharves": "wharf",
    "dwarves": "dwarf", "selves": "self",
    # -ses plurals whose singular itself ends in -s (or in -use).
    "gases": "gas", "lenses": "lens", "irises": "iris",
    "canvases": "canvas", "atlases": "atlas", "fuses": "fuse",
    # -ies plurals of nouns ending in -ie rather than -y.
    "cookies": "cookie", "brownies": "brownie", "prairies": "prairie",
}

# Nouns ending in -s that are already singular or have no distinct singular.
_SINGULAR_S_WORDS = frozenset({
    "gas", "lens", "canvas", "atlas", "molasses", "species", "series",
})


def singularize(word):
    """Best-effort singularization of a lowercase English noun.

    Args:
        word: The candidate word.

    Returns:
        A ``(singular, was_plural)`` tuple. When no rule applies the word is
        returned unchanged with ``was_plural`` set to False.

    The rules are heuristic. Compared with the naive approach of mapping every
    ``-ves`` word to ``-f`` and stripping ``-es`` from every ``-ses``/``-zes``
    word, this version avoids inventing non-words such as "olif" (olives),
    "glof" (gloves), "hors" (horses) or "maz" (mazes).
    """
    if word in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[word], True

    if word in _SINGULAR_S_WORDS:
        return word, False

    # -ies -> -y; the length guard keeps short words like "pies"/"ties" for
    # the generic "-s" rule below, which yields "pie"/"tie".
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y", True

    # Sibilant endings whose plural adds "-es": glasses, buzzes, waltzes,
    # boxes, peaches, bushes.
    if word.endswith(("sses", "zzes", "tzes", "xes", "ches", "shes")):
        return word[:-2], True

    # "-uses" is usually the plural of an "-us" noun (buses, cactuses), but
    # "-ouses" comes from "-ouse" (houses, blouses) and is left to the "-s" rule.
    if word.endswith("uses") and not word.endswith("ouses"):
        return word[:-2], True

    # Generic "-s" plural, which now also covers -ves, -ses and -zes words
    # (olives -> olive, horses -> horse, mazes -> maze). Words ending in
    # -ss / -us / -is are treated as singular.
    if len(word) > 2 and word.endswith("s") and not word.endswith(("ss", "us", "is")):
        return word[:-1], True

    return word, False
|
||
|
|
|
||
|
|
|
||
|
|
def is_plural_of_existing(word, existing_vocab):
    """Tell whether *word* looks like a plural of something in *existing_vocab*.

    Each plural suffix carried by *word* yields one or two guessed singulars;
    the result is True as soon as any guess is a member of *existing_vocab*.
    """
    guesses = []
    if word.endswith("s"):
        # cats -> cat
        guesses.append(word[:-1])
    if word.endswith("es"):
        # boxes -> box
        guesses.append(word[:-2])
    if word.endswith("ies"):
        # berries -> berry
        guesses.append(word[:-3] + "y")
    if word.endswith("ves"):
        # wolves -> wolf, knives -> knife
        stem = word[:-3]
        guesses.extend((stem + "f", stem + "fe"))
    return any(guess in existing_vocab for guess in guesses)
|
||
|
|
|
||
|
|
|
||
|
|
def is_plural_of_candidate(word, accepted_words):
    """Check if word is a plural of another candidate, or vice versa.

    Args:
        word: The candidate being considered.
        accepted_words: Container of words accepted so far.

    Returns:
        True when *word* and some accepted word differ only by a plural
        suffix, in either direction; False otherwise.

    Both directions cover the same four patterns (-s, -es, -y/-ies,
    -f|-fe/-ves), so the answer does not depend on whether the singular or
    the plural form was accepted first.
    """
    related_forms = []

    # Is this word a plural of something accepted?
    if word.endswith("s"):
        related_forms.append(word[:-1])
    if word.endswith("es"):
        related_forms.append(word[:-2])
    if word.endswith("ies"):
        related_forms.append(word[:-3] + "y")
    if word.endswith("ves"):
        related_forms.append(word[:-3] + "f")
        related_forms.append(word[:-3] + "fe")

    # Is something accepted a plural of this word?
    related_forms.append(word + "s")
    related_forms.append(word + "es")
    if word.endswith("y"):
        related_forms.append(word[:-1] + "ies")
    if word.endswith("f"):
        related_forms.append(word[:-1] + "ves")
    if word.endswith("fe"):
        related_forms.append(word[:-2] + "ves")

    return any(form in accepted_words for form in related_forms)
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# LLM categorization
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# System prompt for the categorization request. The category list embedded in
# the text mirrors VALID_CATEGORIES; the "word: category1, category2" output
# format and the SKIP keyword are what parse_categories() looks for.
CATEGORIZE_SYSTEM = """You are a vocabulary categorizer. Given a list of concrete nouns, assign each one to one or more categories from this fixed list:

animal, beverage, bird, building, clothing, container, crop, fabric, fish, flower, food, fruit, furniture, grain, herb, insect, instrument, landscape, material, metal, mineral, organism, plant, rock, seed, shelter, spice, stone, structure, tool, tree, vegetable, vehicle, water, weapon, wood

Rules:
- Use ONLY categories from the list above
- A word can have multiple categories (e.g., "brick" -> material, stone)
- If a word fits none of the categories well, output SKIP
- Output format: word: category1, category2
- One word per line"""

# User prompt template; {word_list} is filled in by categorize_words() with
# one "- word" line per word in the batch.
CATEGORIZE_USER = """Categorize these words:
{word_list}"""
|
||
|
|
|
||
|
|
|
||
|
|
def llm_chat_completion(messages, max_retries=3):
    """POST *messages* to the chat endpoint, retrying with exponential backoff.

    Returns the content string of the first choice on success. After
    *max_retries* failed attempts (any exception counts as a failure) the
    problem is reported on stderr and None is returned.
    """
    import requests

    payload = {
        "model": LLM_MODEL,
        "messages": messages,
    }
    final_attempt = max_retries - 1

    attempt = 0
    while attempt < max_retries:
        try:
            response = requests.post(LLM_ENDPOINT, json=payload, timeout=120)
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except Exception as err:
            print(f" LLM call failed (attempt {attempt + 1}/{max_retries}): {err}",
                  file=sys.stderr)
            if attempt >= final_attempt:
                print(" Giving up on this batch.", file=sys.stderr)
                return None
            # Back off 1s, 2s, 4s, ... before the next try.
            backoff = 2 ** attempt
            print(f" Retrying in {backoff}s...", file=sys.stderr)
            time.sleep(backoff)
        attempt += 1

    return None
|
||
|
|
|
||
|
|
|
||
|
|
def parse_categories(response_text, valid_words):
    """Turn the LLM's "word: cat1, cat2" lines into a {word: [categories]} dict.

    Lines that do not match the format, lines marked SKIP, words not in
    *valid_words*, and words left with no recognised category are all dropped.
    A falsy *response_text* yields an empty dict.
    """
    parsed = {}
    if not response_text:
        return parsed

    for raw_line in response_text.strip().split("\n"):
        entry = raw_line.strip()
        if not entry:
            continue

        # Drop list decoration: a leading "1." / "1)" and then a leading bullet.
        entry = re.sub(r"^[\d]+[.)]\s*", "", entry)
        entry = re.sub(r"^[-*•]\s*", "", entry)
        entry = entry.strip()

        found = re.match(r"^(\w+)\s*:\s*(.+)$", entry)
        if found is None:
            continue

        term = found.group(1).strip().lower()
        category_text = found.group(2).strip()

        if "SKIP" in category_text.upper():
            continue

        # Keep only labels from the fixed category list, in the order given.
        labels = (piece.strip().lower() for piece in category_text.split(","))
        kept = [label for label in labels if label in VALID_CATEGORIES]

        if term in valid_words and kept:
            parsed[term] = kept

    return parsed
|
||
|
|
|
||
|
|
|
||
|
|
def categorize_words(words, batch_size=25):
    """Ask the LLM to categorize *words*, *batch_size* words per request.

    Returns a dict mapping each successfully categorized word to its list of
    categories. Words the model skipped, or whose batch failed, are absent.
    """
    known_words = set(words)
    merged = {}

    batch_starts = range(0, len(words), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        chunk = words[start:start + batch_size]
        bullet_list = "\n".join(f"- {w}" for w in chunk)

        reply = llm_chat_completion([
            {"role": "system", "content": CATEGORIZE_SYSTEM},
            {"role": "user", "content": CATEGORIZE_USER.format(word_list=bullet_list)},
        ])
        batch_result = parse_categories(reply, known_words)
        merged.update(batch_result)

        print(f" Batch {batch_number}: {len(batch_result)}/{len(chunk)} categorized")
        # Short pause between requests.
        time.sleep(0.1)

    return merged
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Main
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# Column order used for rows appended to the vocabulary CSV.
_VOCAB_FIELDNAMES = [
    "word", "categories", "tangibility_score",
    "conceptnet_edge_count", "frequency_rank",
]

# Nouns ending in "-ing" that are exempt from the gerund catch-all filter.
_ING_NOUNS = {"ring", "spring", "string", "wing", "ceiling"}


def _rejection_reason(word, citation_count, min_citations, existing_words, accepted_set):
    """Return the reject-reason key for *word*, or None if it passes every filter.

    The checks run in a fixed order and the first one that fires decides the
    reason, so the per-reason tallies depend on this ordering.
    """
    if citation_count < min_citations:
        return "below_threshold"
    # No multi-word (underscore) candidates.
    if "_" in word:
        return "multi_word"
    if word in existing_words:
        return "already_in_vocab"

    exclusion_lists = (
        (EXCLUDE_ABSTRACT, "abstract"),
        (EXCLUDE_ADJECTIVES, "adjective"),
        (EXCLUDE_VERBS, "verb_gerund"),
        (EXCLUDE_TECHNICAL, "technical"),
        (EXCLUDE_INSTITUTIONAL, "institutional"),
    )
    for excluded, reason in exclusion_lists:
        if word in excluded:
            return reason

    # Gerund pattern catch-all (but allow known "-ing" nouns).
    if word.endswith("ing") and word not in _ING_NOUNS:
        return "gerund_pattern"
    if is_plural_of_existing(word, existing_words):
        return "plural_of_existing"
    if is_plural_of_candidate(word, accepted_set):
        return "plural_of_candidate"
    if len(word) < 2:
        return "too_short"
    return None


def _ends_with_newline(path):
    """Return True when the file at *path* is empty or ends with a newline byte."""
    with open(path, "rb") as f:
        f.seek(0, 2)  # whence=2: relative to end of file
        if f.tell() == 0:
            return True
        f.seek(-1, 2)
        return f.read(1) == b"\n"


def main():
    """Command-line entry point: filter candidates, categorize, append to vocab.

    Steps:
      1. Read the existing vocabulary and the candidate rows.
      2. Merge plural candidates into their singular form and count the
         distinct ``suggested_by`` values per word.
      3. Reject candidates via the citation threshold, the exclusion lists and
         the plural checks, tallying one reason per rejected word.
      4. With ``--dry-run``, stop after printing the accepted words.
      5. Assign categories (via the LLM, or the "material" placeholder with
         ``--no-llm``), back up the vocabulary CSV, and append the new rows.
    """
    parser = argparse.ArgumentParser(
        description="Expand folksy vocabulary with LLM-suggested candidates."
    )
    parser.add_argument("--min-citations", type=int, default=5,
                        help="Minimum number of vocab words that suggested this candidate (default: 5)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be added without modifying files")
    parser.add_argument("--no-llm", action="store_true",
                        help="Skip LLM categorization (use placeholder categories)")
    args = parser.parse_args()

    # Load existing vocab (only the words themselves are needed here).
    with open(VOCAB_CSV, newline="", encoding="utf-8") as f:
        existing_words = {row["word"] for row in csv.DictReader(f)}
    print(f"Existing vocabulary: {len(existing_words)} words")

    # Load candidates, collecting the distinct suggesters of each word.
    raw_sources = defaultdict(set)
    candidate_rows = 0
    with open(CANDIDATE_CSV, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            candidate_rows += 1
            raw_sources[row["word"]].add(row["suggested_by"])

    print(f"Total candidate rows: {candidate_rows}")
    print(f"Unique candidate words: {len(raw_sources)}")

    # Normalize plurals: merge citation sets into the singular form.
    normalized_sources = defaultdict(set)
    for word, sources in raw_sources.items():
        singular, _was_plural = singularize(word)
        normalized_sources[singular].update(sources)
    word_sources = dict(normalized_sources)
    print(f"After singularization: {len(word_sources)} unique candidates")

    # Filter. Most-cited candidates go first, so when two related forms both
    # qualify the better-supported one is the one that gets accepted.
    accepted = []
    accepted_set = set()
    reject_reasons = Counter()
    ranked = sorted(word_sources.items(), key=lambda item: len(item[1]), reverse=True)
    for word, sources in ranked:
        citation_count = len(sources)
        reason = _rejection_reason(
            word, citation_count, args.min_citations, existing_words, accepted_set
        )
        if reason is not None:
            reject_reasons[reason] += 1
            continue
        accepted.append((word, citation_count))
        accepted_set.add(word)

    print("\nFiltering results:")
    print(f" Accepted: {len(accepted)}")
    for reason, count in reject_reasons.most_common():
        print(f" Rejected ({reason}): {count}")

    if not accepted:
        print("\nNo candidates passed filtering.")
        return

    # Show accepted words.
    print(f"\nAccepted candidates ({len(accepted)}):")
    for word, count in accepted:
        print(f" {word:25s} cited by {count:3d} vocab words")

    if args.dry_run:
        print(f"\nDry run complete. Would add {len(accepted)} words to vocabulary.")
        return

    # Categorize with the LLM (or the placeholder).
    words_to_categorize = [w for w, _ in accepted]

    if args.no_llm:
        print("\nSkipping LLM categorization (--no-llm). Using 'material' as placeholder.")
        categories = {w: ["material"] for w in words_to_categorize}
    else:
        print(f"\nCategorizing {len(words_to_categorize)} words with LLM...")
        categories = categorize_words(words_to_categorize)

    # Words the LLM couldn't categorize get skipped.
    uncategorized = [w for w in words_to_categorize if w not in categories]
    if uncategorized:
        print(f"\n {len(uncategorized)} words could not be categorized (skipped):")
        for w in uncategorized:
            print(f" {w}")

    # Build new vocab entries; the numeric columns get fixed default values.
    new_entries = [
        {
            "word": word,
            "categories": ",".join(categories[word]),
            "tangibility_score": "0.80",
            "conceptnet_edge_count": "0",
            "frequency_rank": "0",
        }
        for word, _citation_count in accepted
        if word in categories
    ]

    if not new_entries:
        print("\nNo entries to add after categorization.")
        return

    # Backup existing vocab next to the original, with a timestamp suffix.
    backup_path = VOCAB_CSV.with_suffix(f".csv.bak.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    shutil.copy2(VOCAB_CSV, backup_path)
    print(f"\nBacked up vocabulary to {backup_path.name}")

    # Append to vocab CSV. If the file does not end with a newline, the first
    # appended row would be glued onto the last existing row and corrupt both,
    # so terminate that row first.
    needs_newline = not _ends_with_newline(VOCAB_CSV)
    with open(VOCAB_CSV, "a", newline="", encoding="utf-8") as f:
        if needs_newline:
            f.write("\n")
        writer = csv.DictWriter(f, fieldnames=_VOCAB_FIELDNAMES)
        writer.writerows(new_entries)

    print(f"\nAdded {len(new_entries)} words to {VOCAB_CSV.name}")
    print(f"New vocabulary size: {len(existing_words) + len(new_entries)}")

    # Summary by category.
    cat_counts = Counter()
    for entry in new_entries:
        for c in entry["categories"].split(","):
            cat_counts[c.strip()] += 1
    print("\nNew words by category:")
    for cat, count in cat_counts.most_common():
        print(f" {cat:20s} {count:3d}")

    print("\nNext step: run 'python scripts/enhance_graph.py --phase 1' to generate edges for new words.")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|