#!/usr/bin/env python3
"""
Build folksy vocabulary CSV from ConceptNet5 PostgreSQL database.

Steps:
1. Gather candidates via IsA categories
2. Filter to single-word concrete nouns
3. Calculate tangibility score
4. Count total edges
5. Add manual additions
6. Write output CSV
"""
import csv
import sys

DB_NAME = "conceptnet5"

# Default location for the generated CSV (overridable via step6_write_output).
DEFAULT_OUTPUT_PATH = '/home/john/Development/folksy-generator/data/folksy_vocab.csv'

# IsA categories and their node IDs (pre-looked up)
CATEGORY_IDS = {
    20865: 'animal', 22802: 'beverage', 20866: 'bird', 40218: 'building',
    21578: 'clothing', 144957: 'container', 26028: 'crop', 148705: 'fabric',
    22922: 'fish', 26249: 'flower', 22803: 'food', 31187: 'fruit',
    22610: 'furniture', 114948: 'grain', 641297: 'herb', 152432: 'insect',
    152437: 'instrument', 153470: 'livestock', 33562: 'material',
    25893: 'metal', 20869: 'mineral', 20872: 'plant', 37511: 'rock',
    25753: 'seed', 44101: 'spice', 37357: 'stone', 159255: 'tool',
    40174: 'tree', 20874: 'vegetable', 144388: 'vehicle', 156331: 'weapon',
    31507: 'wood'
}

# Relation IDs (pre-looked up from the relations table)
RELATION_IDS = {
    'IsA': 23,
    'AtLocation': 1, 'MadeOf': 25, 'PartOf': 33, 'UsedFor': 39,
    'HasA': 15, 'ReceivesAction': 34, 'CreatedBy': 5,
    'HasProperty': 20, 'Causes': 3, 'MotivatedByGoal': 27,
    'CausesDesire': 4, 'Desires': 8, 'HasSubevent': 21
}

# Relations that suggest a word names something physical...
CONCRETE_RELS = [RELATION_IDS[r] for r in
                 ['AtLocation', 'MadeOf', 'PartOf', 'UsedFor', 'HasA',
                  'ReceivesAction', 'CreatedBy']]
# ...versus relations that suggest an abstract concept.
ABSTRACT_RELS = [RELATION_IDS[r] for r in
                 ['HasProperty', 'Causes', 'MotivatedByGoal', 'CausesDesire',
                  'Desires', 'HasSubevent']]

# Frozenset views for O(1) membership tests in the per-row tally loops.
_CONCRETE_REL_SET = frozenset(CONCRETE_RELS)
_ABSTRACT_REL_SET = frozenset(ABSTRACT_RELS)

MANUAL_ADDITIONS = [
    'well', 'fence', 'barn', 'creek', 'porch', 'chimney', 'saddle', 'hearth',
    'kettle', 'plow', 'silo', 'trough', 'yoke', 'anvil', 'bellows', 'thimble',
    'lantern', 'candle', 'broom', 'bucket', 'ladder', 'rope', 'nail', 'hay',
    'straw', 'wool', 'leather', 'tar', 'wax', 'cork', 'flint', 'chalk',
    'clay', 'ash', 'soot', 'rust', 'mold', 'moss', 'bark', 'root', 'stem',
    'thorn', 'vine', 'husk', 'shell', 'pit', 'den', 'nest', 'burrow', 'coop',
    'stable', 'pasture', 'meadow', 'orchard', 'garden', 'pond', 'ditch',
    'ridge', 'hollow', 'furrow'
]

# Common-sense categories for manual additions that might not have IsA edges
MANUAL_CATEGORIES = {
    'well': 'structure', 'fence': 'structure', 'barn': 'building',
    'creek': 'water,landscape', 'porch': 'structure', 'chimney': 'structure',
    'saddle': 'tool', 'hearth': 'structure', 'kettle': 'container',
    'plow': 'tool', 'silo': 'building', 'trough': 'container',
    'yoke': 'tool', 'anvil': 'tool', 'bellows': 'tool', 'thimble': 'tool',
    'lantern': 'tool', 'candle': 'tool', 'broom': 'tool',
    'bucket': 'container', 'ladder': 'tool', 'rope': 'material',
    'nail': 'tool', 'hay': 'plant,crop', 'straw': 'material,crop',
    'wool': 'fabric,material', 'leather': 'fabric,material',
    'tar': 'material', 'wax': 'material', 'cork': 'material',
    'flint': 'stone', 'chalk': 'material,mineral', 'clay': 'material',
    'ash': 'material', 'soot': 'material', 'rust': 'material',
    'mold': 'organism', 'moss': 'plant', 'bark': 'plant', 'root': 'plant',
    'stem': 'plant', 'thorn': 'plant', 'vine': 'plant', 'husk': 'plant',
    'shell': 'container', 'pit': 'seed,landscape', 'den': 'shelter',
    'nest': 'shelter', 'burrow': 'shelter', 'coop': 'building',
    'stable': 'building', 'pasture': 'landscape', 'meadow': 'landscape',
    'orchard': 'landscape', 'garden': 'landscape', 'pond': 'water,landscape',
    'ditch': 'landscape', 'ridge': 'landscape', 'hollow': 'landscape',
    'furrow': 'landscape'
}

# Words to exclude (misspellings, plural forms, overly abstract, non-folksy)
EXCLUDE_WORDS = {
    'bannana', 'brocolli', 'cardimom', 'carary', 'cassorwary', 'cucmber',
    'cummin', 'dragonsnap', 'elefefant', 'guitare', 'hollie', 'potoato',
    'rhodedendron', 'sandwitch', 'saphire', 'saxiphone', 'soupd',
    'tourqouise', 'tiramisu', 'bbq', 'cajun', 'mexican', 'pepsi', 'coke',
    'spam', 'accordian', 'comealong', 'rooter', 'tweety', 'guru1',
    'softball', 'nutdriver', 'posessions', 'anus', 'bloodsucker',
    'whorehouse', 'cuck',
    # Plurals when singular exists
    'blueberries', 'carrots', 'eggs', 'pears', 'peas', 'peaches', 'limes',
    'raisins', 'plums', 'rubies', 'emeralds', 'shirts', 'shoes', 'tomatoes',
    'potatoes', 'plastics', 'vegetables', 'animals', 'products',
    'vertebrates', 'pianos', 'lures', 'pens', 'crampons',
    # Too technical/non-folksy
    'bronchoscope', 'dioptometer', 'calibrachoa', 'brachycome', 'diascia',
    'osteospermum', 'nemesia', 'helichrysum', 'scavola', 'silphium',
    'cuphea', 'euonymus', 'arborvitae', 'ipomoea', 'bacopa', 'lamium',
    'falsecypress', 'boottree', 'sedimentary', 'catheter', 'caltrops',
    'argyranthemum', 'sunn',
    # Too generic/abstract
    'creature', 'invertebrate', 'primate', 'marsupial', 'crustacean',
    'arthropod', 'avian', 'amphibian', 'rodent', 'pet', 'explosive',
    'automatic', 'percussion', 'woodwind', 'laundry',
    # fictional
    'unicorn', 'dragon', 'pinguin',
    # remaining misspellings / obscure non-folksy fish
    'trumbone', 'eidar', 'monchong', 'opakapaka', 'opah', 'cumquat',
}


def connect():
    """Open a connection to the local ConceptNet5 PostgreSQL database."""
    # Imported lazily so the module can be imported (e.g. to unit-test the
    # pure-Python steps) on machines without the psycopg2 driver installed.
    import psycopg2
    return psycopg2.connect(dbname=DB_NAME)


def step1_gather_candidates(conn):
    """Gather all English base single-word nodes that IsA our categories.

    Returns a dict mapping word -> {'categories': [...],
    'tangibility_score': 0.0, 'edge_count': 0} for every word that
    survives the SQL filters and EXCLUDE_WORDS.
    """
    print("Step 1: Gathering IsA candidates...")
    cur = conn.cursor()
    # SUBSTRING ... FROM 7 strips the '/c/en/' prefix (SQL positions are
    # 1-indexed; the old FROM 6 left a leading '/' that had to be stripped
    # again in Python).  The NOT LIKE filters drop sense-tagged URIs
    # ('/c/en/word/n') and multi-word terms ('/c/en/two_words'); the
    # default LIKE escape character '\' makes \_ match a literal underscore.
    # IDs are passed as query parameters rather than interpolated.
    cur.execute("""
        SELECT SUBSTRING(n_start.uri FROM 7) AS word,
               ARRAY_AGG(DISTINCT n_end.id) AS cat_ids
        FROM edges e
        JOIN nodes n_start ON e.start_id = n_start.id
        JOIN nodes n_end ON e.end_id = n_end.id
        WHERE e.relation_id = %s
          AND e.weight >= 1.0
          AND n_start.uri LIKE '/c/en/%%'
          AND n_start.uri NOT LIKE '/c/en/%%/%%'
          AND n_start.uri NOT LIKE '/c/en/%%\\_%%'
          AND n_end.id = ANY(%s)
        GROUP BY n_start.uri
    """, (RELATION_IDS['IsA'], list(CATEGORY_IDS.keys())))

    candidates = {}
    for word, cat_ids in cur.fetchall():
        if word in EXCLUDE_WORDS:
            continue
        categories = sorted({CATEGORY_IDS[cid] for cid in cat_ids
                             if cid in CATEGORY_IDS})
        candidates[word] = {
            'categories': categories,
            'tangibility_score': 0.0,
            'edge_count': 0
        }
    cur.close()
    print(f"  Found {len(candidates)} candidates after filtering")
    return candidates


def step5_add_manual(conn, candidates):
    """Add manual additions that aren't already in candidates.

    `conn` is unused; it is kept so all step functions share a signature.
    Mutates and returns `candidates`.
    """
    print("Step 5: Adding manual additions...")
    added = 0
    for word in MANUAL_ADDITIONS:
        if word not in candidates:
            cats = MANUAL_CATEGORIES.get(word, 'misc').split(',')
            candidates[word] = {
                'categories': sorted(cats),
                'tangibility_score': 0.0,
                'edge_count': 0
            }
            added += 1
        else:
            # Merge manual categories with the ones found via IsA edges.
            existing_cats = set(candidates[word]['categories'])
            manual_cats = set(MANUAL_CATEGORIES.get(word, '').split(',')) - {''}
            candidates[word]['categories'] = sorted(existing_cats | manual_cats)
    print(f"  Added {added} new words from manual list")
    print(f"  Total candidates: {len(candidates)}")
    return candidates


def _tally_relation_counts(rows, node_id_to_word, candidates):
    """Fold (node_id, relation_id, count) rows into per-word tallies.

    Increments 'concrete_count' / 'abstract_count' on each candidate
    according to which relation bucket the row's relation falls into.
    """
    for nid, rel_id, cnt in rows:
        word = node_id_to_word.get(nid)
        if not word or word not in candidates:
            continue
        data = candidates[word]
        if rel_id in _CONCRETE_REL_SET:
            data['concrete_count'] = data.get('concrete_count', 0) + cnt
        elif rel_id in _ABSTRACT_REL_SET:
            data['abstract_count'] = data.get('abstract_count', 0) + cnt


def step3_4_tangibility_and_edges(conn, candidates):
    """Calculate tangibility scores and total edge counts for all candidates.

    tangibility_score = concrete_edges / (concrete_edges + abstract_edges),
    rounded to 2 places; edge_count is the total number of English edges
    (any relation, weight >= 1.0) touching the word's node.  Words whose
    node URI is missing from the DB are removed from `candidates`.
    """
    print("Steps 3-4: Calculating tangibility scores and edge counts...")
    cur = conn.cursor()

    # Batch-resolve candidate words to node IDs in one query.
    words = list(candidates.keys())
    uris = [f'/c/en/{w}' for w in words]
    cur.execute("""
        SELECT uri, id FROM nodes WHERE uri = ANY(%s)
    """, (uris,))
    word_to_node_id = {}
    for uri, nid in cur.fetchall():
        word = uri[6:]  # strip '/c/en/' (Python 0-indexed: 6 chars)
        word_to_node_id[word] = nid

    # Debug: show a sample
    sample = list(word_to_node_id.items())[:5]
    print(f"  Sample word->id mappings: {sample}")
    print(f"  Found node IDs for {len(word_to_node_id)}/{len(words)} words")

    # Words without node IDs cannot be scored - remove them.
    missing = [w for w in words if w not in word_to_node_id]
    if missing:
        print(f"  Missing from DB (removing): {missing[:20]}...")
        for w in missing:
            del candidates[w]

    if not word_to_node_id:
        print("  ERROR: No node IDs found!")
        return candidates

    node_ids = list(word_to_node_id.values())
    node_id_to_word = {v: k for k, v in word_to_node_id.items()}
    all_scored_rels = CONCRETE_RELS + ABSTRACT_RELS

    # For each node (as start or end), count concrete and abstract edges.
    # Only English counterparts count, so the other end is filtered to
    # /c/en/.  Done in batches to keep result sets small.
    batch_size = 200
    for batch_start in range(0, len(node_ids), batch_size):
        batch = node_ids[batch_start:batch_start + batch_size]
        if batch_start % 1000 == 0:
            print(f"  Processing batch {batch_start}/{len(node_ids)}...")

        # Scored-relation counts with the candidate as the start node.
        cur.execute("""
            SELECT e.start_id, e.relation_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.end_id = n_other.id
            WHERE e.start_id = ANY(%s)
              AND e.weight >= 1.0
              AND e.relation_id = ANY(%s)
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY e.start_id, e.relation_id
        """, (batch, all_scored_rels))
        _tally_relation_counts(cur.fetchall(), node_id_to_word, candidates)

        # Same, with the candidate as the end node.
        cur.execute("""
            SELECT e.end_id, e.relation_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.start_id = n_other.id
            WHERE e.end_id = ANY(%s)
              AND e.weight >= 1.0
              AND e.relation_id = ANY(%s)
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY e.end_id, e.relation_id
        """, (batch, all_scored_rels))
        _tally_relation_counts(cur.fetchall(), node_id_to_word, candidates)

        # Total edge count (any relation, English counterpart, weight >= 1).
        cur.execute("""
            SELECT start_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.end_id = n_other.id
            WHERE e.start_id = ANY(%s)
              AND e.weight >= 1.0
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY start_id
        """, (batch,))
        for nid, cnt in cur.fetchall():
            word = node_id_to_word.get(nid)
            if word and word in candidates:
                candidates[word]['edge_count'] += cnt

        cur.execute("""
            SELECT end_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.start_id = n_other.id
            WHERE e.end_id = ANY(%s)
              AND e.weight >= 1.0
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY end_id
        """, (batch,))
        for nid, cnt in cur.fetchall():
            word = node_id_to_word.get(nid)
            if word and word in candidates:
                candidates[word]['edge_count'] += cnt

    # Turn the tallies into a 0..1 concreteness ratio.
    for word, data in candidates.items():
        concrete = data.get('concrete_count', 0)
        abstract = data.get('abstract_count', 0)
        total = concrete + abstract
        if total > 0:
            data['tangibility_score'] = round(concrete / total, 2)
        else:
            data['tangibility_score'] = 0.0

    cur.close()
    return candidates


def step6_write_output(candidates, output_path=DEFAULT_OUTPUT_PATH):
    """Write the final CSV, sorted by edge_count descending.

    `output_path` defaults to the project data file but can be overridden
    (e.g. for testing).  Returns the path written.
    """
    print(f"Step 6: Writing output to {output_path}")
    sorted_words = sorted(candidates.items(),
                          key=lambda x: x[1]['edge_count'], reverse=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'categories', 'tangibility_score',
                         'conceptnet_edge_count', 'frequency_rank'])
        for word, data in sorted_words:
            categories = ','.join(data['categories'])
            # frequency_rank is a placeholder (0) filled in downstream.
            writer.writerow([word, categories, data['tangibility_score'],
                             data['edge_count'], 0])
    print(f"  Wrote {len(sorted_words)} words")
    return output_path


def main():
    """Run the full pipeline: gather, augment, score, and write the CSV."""
    conn = connect()
    try:
        # Step 1 + 2 (filtering is built into the SQL)
        candidates = step1_gather_candidates(conn)
        # Step 5: Add manual additions (before scoring so they get scored too)
        candidates = step5_add_manual(conn, candidates)
        # Steps 3 + 4: Tangibility and edge counts
        candidates = step3_4_tangibility_and_edges(conn, candidates)
        # Step 6: Write output
        path = step6_write_output(candidates)

        # Summary stats
        scores = [d['tangibility_score'] for d in candidates.values()
                  if d['tangibility_score'] > 0]
        edges = [d['edge_count'] for d in candidates.values()]
        print(f"\nSummary:")
        print(f"  Total words: {len(candidates)}")
        print(f"  Words with tangibility > 0: {len(scores)}")
        if scores:
            print(f"  Avg tangibility: {sum(scores)/len(scores):.2f}")
        if edges:
            print(f"  Avg edge count: {sum(edges)/len(edges):.1f}")
            print(f"  Max edge count: {max(edges)}")
            print(f"  Min edge count: {min(edges)}")
        print(f"  Output: {path}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()