# folksy_idioms/scripts/extract_from_conceptnet.py
#!/usr/bin/env python3
"""
Build folksy vocabulary CSV from ConceptNet5 PostgreSQL database.
Steps:
1. Gather candidates via IsA categories
2. Filter to single-word concrete nouns
3. Calculate tangibility score
4. Count total edges
5. Add manual additions
6. Write output CSV
"""
import psycopg2
import csv
import sys
# Name of the local ConceptNet5 PostgreSQL database to connect to.
DB_NAME = "conceptnet5"

# IsA categories and their node IDs (pre-looked up).
# Maps ConceptNet node id -> category label used in the output CSV.
CATEGORY_IDS = {
    20865: 'animal', 22802: 'beverage', 20866: 'bird', 40218: 'building',
    21578: 'clothing', 144957: 'container', 26028: 'crop', 148705: 'fabric',
    22922: 'fish', 26249: 'flower', 22803: 'food', 31187: 'fruit',
    22610: 'furniture', 114948: 'grain', 641297: 'herb', 152432: 'insect',
    152437: 'instrument', 153470: 'livestock', 33562: 'material',
    25893: 'metal', 20869: 'mineral', 20872: 'plant', 37511: 'rock',
    25753: 'seed', 44101: 'spice', 37357: 'stone', 159255: 'tool',
    40174: 'tree', 20874: 'vegetable', 144388: 'vehicle', 156331: 'weapon',
    31507: 'wood'
}

# Relation IDs (pre-looked up from relations table).
RELATION_IDS = {
    'AtLocation': 1, 'MadeOf': 25, 'PartOf': 33, 'UsedFor': 39,
    'HasA': 15, 'ReceivesAction': 34, 'CreatedBy': 5,
    'HasProperty': 20, 'Causes': 3, 'MotivatedByGoal': 27,
    'CausesDesire': 4, 'Desires': 8, 'HasSubevent': 21
}

# Relation groups used for the tangibility score: edges in CONCRETE_RELS
# suggest a physical thing, edges in ABSTRACT_RELS suggest abstract usage.
CONCRETE_RELS = [RELATION_IDS[r] for r in ['AtLocation', 'MadeOf', 'PartOf', 'UsedFor', 'HasA', 'ReceivesAction', 'CreatedBy']]
ABSTRACT_RELS = [RELATION_IDS[r] for r in ['HasProperty', 'Causes', 'MotivatedByGoal', 'CausesDesire', 'Desires', 'HasSubevent']]

# Folksy words force-included even when the IsA query misses them.
MANUAL_ADDITIONS = [
    'well', 'fence', 'barn', 'creek', 'porch', 'chimney', 'saddle', 'hearth',
    'kettle', 'plow', 'silo', 'trough', 'yoke', 'anvil', 'bellows', 'thimble',
    'lantern', 'candle', 'broom', 'bucket', 'ladder', 'rope', 'nail', 'hay',
    'straw', 'wool', 'leather', 'tar', 'wax', 'cork', 'flint', 'chalk', 'clay',
    'ash', 'soot', 'rust', 'mold', 'moss', 'bark', 'root', 'stem', 'thorn',
    'vine', 'husk', 'shell', 'pit', 'den', 'nest', 'burrow', 'coop', 'stable',
    'pasture', 'meadow', 'orchard', 'garden', 'pond', 'ditch', 'ridge',
    'hollow', 'furrow'
]

# Common-sense categories for manual additions that might not have IsA edges.
# Values are comma-separated category labels (split on ',' when merged).
MANUAL_CATEGORIES = {
    'well': 'structure', 'fence': 'structure', 'barn': 'building',
    'creek': 'water,landscape', 'porch': 'structure', 'chimney': 'structure',
    'saddle': 'tool', 'hearth': 'structure', 'kettle': 'container',
    'plow': 'tool', 'silo': 'building', 'trough': 'container',
    'yoke': 'tool', 'anvil': 'tool', 'bellows': 'tool',
    'thimble': 'tool', 'lantern': 'tool', 'candle': 'tool',
    'broom': 'tool', 'bucket': 'container', 'ladder': 'tool',
    'rope': 'material', 'nail': 'tool', 'hay': 'plant,crop',
    'straw': 'material,crop', 'wool': 'fabric,material',
    'leather': 'fabric,material', 'tar': 'material', 'wax': 'material',
    'cork': 'material', 'flint': 'stone', 'chalk': 'material,mineral',
    'clay': 'material', 'ash': 'material', 'soot': 'material',
    'rust': 'material', 'mold': 'organism', 'moss': 'plant',
    'bark': 'plant', 'root': 'plant', 'stem': 'plant',
    'thorn': 'plant', 'vine': 'plant', 'husk': 'plant',
    'shell': 'container', 'pit': 'seed,landscape', 'den': 'shelter',
    'nest': 'shelter', 'burrow': 'shelter', 'coop': 'building',
    'stable': 'building', 'pasture': 'landscape', 'meadow': 'landscape',
    'orchard': 'landscape', 'garden': 'landscape', 'pond': 'water,landscape',
    'ditch': 'landscape', 'ridge': 'landscape', 'hollow': 'landscape',
    'furrow': 'landscape'
}

# Words to exclude (misspellings, plural forms, overly abstract, non-folksy).
EXCLUDE_WORDS = {
    'bannana', 'brocolli', 'cardimom', 'carary', 'cassorwary', 'cucmber',
    'cummin', 'dragonsnap', 'elefefant', 'guitare', 'hollie', 'potoato',
    'rhodedendron', 'sandwitch', 'saphire', 'saxiphone', 'soupd', 'tourqouise',
    'tiramisu', 'bbq', 'cajun', 'mexican', 'pepsi', 'coke', 'spam', 'accordian',
    'comealong', 'rooter', 'tweety', 'guru1', 'softball', 'nutdriver',
    'posessions', 'anus', 'bloodsucker', 'whorehouse', 'cuck',
    # Plurals when singular exists
    'blueberries', 'carrots', 'eggs', 'pears', 'peas', 'peaches', 'limes',
    'raisins', 'plums', 'rubies', 'emeralds', 'shirts', 'shoes', 'tomatoes',
    'potatoes', 'plastics', 'vegetables', 'animals', 'products', 'vertebrates',
    'pianos', 'lures', 'pens', 'crampons',
    # Too technical/non-folksy
    'bronchoscope', 'dioptometer', 'calibrachoa', 'brachycome', 'diascia',
    'osteospermum', 'nemesia', 'helichrysum', 'scavola', 'silphium',
    'cuphea', 'euonymus', 'arborvitae', 'ipomoea', 'bacopa', 'lamium',
    'falsecypress', 'boottree', 'sedimentary', 'catheter', 'caltrops',
    'argyranthemum', 'sunn',
    # Too generic/abstract
    # NOTE(review): 'products' is listed twice (also under plurals) — harmless in a set literal.
    'creature', 'invertebrate', 'primate', 'marsupial', 'crustacean',
    'arthropod', 'avian', 'amphibian', 'rodent', 'pet', 'explosive',
    'automatic', 'percussion', 'woodwind', 'laundry', 'products',
    # fictional
    'unicorn', 'dragon', 'pinguin',
    # remaining misspellings / obscure non-folksy fish
    'trumbone', 'eidar', 'monchong', 'opakapaka', 'opah', 'cumquat',
}
def connect():
    """Open a psycopg2 connection to the local ConceptNet5 database (DB_NAME)."""
    return psycopg2.connect(dbname=DB_NAME)
def step1_gather_candidates(conn):
    """Gather all English base single-word nodes that IsA our categories.

    One aggregate query finds every English node ('/c/en/<word>', no sense
    suffix '/..', no underscore i.e. a single word) that has an IsA edge
    (relation_id 23) with weight >= 1.0 into a node in CATEGORY_IDS, and
    collects the distinct matching category ids per word.

    Args:
        conn: open psycopg2 connection to the ConceptNet5 database.

    Returns:
        dict mapping word -> {'categories': sorted labels,
        'tangibility_score': 0.0, 'edge_count': 0}. Words in EXCLUDE_WORDS
        are dropped.
    """
    print("Step 1: Gathering IsA candidates...")
    cur = conn.cursor()
    # Category ids are bound as a parameter (= ANY(%s)) instead of being
    # interpolated into the SQL text. Because parameters are supplied,
    # psycopg2 treats %% as a literal % in the LIKE patterns below.
    cur.execute("""
        SELECT
            SUBSTRING(n_start.uri FROM 7) AS word,  -- SQL is 1-indexed: '/c/en/' is 6 chars
            ARRAY_AGG(DISTINCT n_end.id) AS cat_ids
        FROM edges e
        JOIN nodes n_start ON e.start_id = n_start.id
        JOIN nodes n_end ON e.end_id = n_end.id
        WHERE e.relation_id = 23
          AND e.weight >= 1.0
          AND n_start.uri LIKE '/c/en/%%'
          AND n_start.uri NOT LIKE '/c/en/%%/%%'
          AND n_start.uri NOT LIKE '/c/en/%%\\_%%'
          AND n_end.id = ANY(%s)
        GROUP BY n_start.uri
    """, (list(CATEGORY_IDS.keys()),))
    candidates = {}
    for word, cat_ids in cur.fetchall():
        # Defensive guard kept from the old off-by-one SUBSTRING (FROM 6),
        # which left a leading '/' on every word.
        word = word.lstrip('/')
        if word in EXCLUDE_WORDS:
            continue
        categories = sorted({CATEGORY_IDS[cid] for cid in cat_ids if cid in CATEGORY_IDS})
        candidates[word] = {
            'categories': categories,
            'tangibility_score': 0.0,
            'edge_count': 0
        }
    cur.close()
    print(f" Found {len(candidates)} candidates after filtering")
    return candidates
def step5_add_manual(conn, candidates):
    """Fold the MANUAL_ADDITIONS word list into the candidate dict.

    New words get categories from MANUAL_CATEGORIES (or 'misc') with zeroed
    scores; words already present get their category sets merged. Mutates
    and returns `candidates`.
    """
    print("Step 5: Adding manual additions...")
    new_count = 0
    for term in MANUAL_ADDITIONS:
        entry = candidates.get(term)
        if entry is None:
            # Not found by the IsA query: seed a fresh record.
            labels = MANUAL_CATEGORIES.get(term, 'misc').split(',')
            candidates[term] = {
                'categories': sorted(labels),
                'tangibility_score': 0.0,
                'edge_count': 0
            }
            new_count += 1
        else:
            # Already present: union manual categories into the existing set.
            merged = set(entry['categories'])
            merged.update(c for c in MANUAL_CATEGORIES.get(term, '').split(',') if c)
            entry['categories'] = sorted(merged)
    print(f" Added {new_count} new words from manual list")
    print(f" Total candidates: {len(candidates)}")
    return candidates
def step3_4_tangibility_and_edges(conn, candidates):
    """Calculate tangibility scores and total edge counts for all candidates.

    Tangibility = concrete / (concrete + abstract), where the two buckets
    count edges whose relation is in CONCRETE_RELS / ABSTRACT_RELS. Only
    edges with weight >= 1.0 whose other endpoint is English ('/c/en/...')
    are counted, with the candidate node on either end of the edge.
    Candidates with no node row in the DB are removed in place.

    Args:
        conn: open psycopg2 connection to the ConceptNet5 database.
        candidates: dict from step1/step5; mutated and returned.
    """
    print("Steps 3-4: Calculating tangibility scores and edge counts...")
    cur = conn.cursor()
    # Batch-resolve all candidate words to node ids in a single query.
    words = list(candidates.keys())
    uris = [f'/c/en/{w}' for w in words]
    cur.execute("""
        SELECT uri, id FROM nodes
        WHERE uri = ANY(%s)
    """, (uris,))
    # uri[6:] strips the 6-character '/c/en/' prefix (Python 0-indexed).
    word_to_node_id = {uri[6:]: nid for uri, nid in cur.fetchall()}
    # Debug: show a sample
    sample = list(word_to_node_id.items())[:5]
    print(f" Sample word->id mappings: {sample}")
    print(f" Found node IDs for {len(word_to_node_id)}/{len(words)} words")
    # Words without node IDs - remove them
    missing = [w for w in words if w not in word_to_node_id]
    if missing:
        print(f" Missing from DB (removing): {missing[:20]}...")
        for w in missing:
            del candidates[w]
    if not word_to_node_id:
        print(" ERROR: No node IDs found!")
        return candidates
    node_id_to_word = {v: k for k, v in word_to_node_id.items()}
    all_scored_rels = CONCRETE_RELS + ABSTRACT_RELS

    def _accumulate_scored(nid, rel_id, cnt):
        """Add one (node, relation) count to the matching concrete/abstract bucket."""
        word = node_id_to_word.get(nid)
        if not word or word not in candidates:
            return
        if rel_id in CONCRETE_RELS:
            candidates[word]['concrete_count'] = candidates[word].get('concrete_count', 0) + cnt
        elif rel_id in ABSTRACT_RELS:
            candidates[word]['abstract_count'] = candidates[word].get('abstract_count', 0) + cnt

    # Process node ids in batches to keep each query's result set small.
    batch_size = 200
    node_id_list = list(word_to_node_id.values())
    for batch_start in range(0, len(node_id_list), batch_size):
        batch = node_id_list[batch_start:batch_start + batch_size]
        if batch_start % 1000 == 0:
            print(f" Processing batch {batch_start}/{len(node_id_list)}...")
        # Count edges with the candidate as start node, then as end node.
        # The column names are our own constants, so the f-string SQL is safe;
        # all data values are bound as parameters.
        for node_col, other_col in (('start_id', 'end_id'), ('end_id', 'start_id')):
            # Scored relations (concrete + abstract), broken out per relation.
            cur.execute(f"""
                SELECT e.{node_col}, e.relation_id, COUNT(*)
                FROM edges e
                JOIN nodes n_other ON e.{other_col} = n_other.id
                WHERE e.{node_col} = ANY(%s)
                  AND e.weight >= 1.0
                  AND e.relation_id = ANY(%s)
                  AND n_other.uri LIKE '/c/en/%%'
                GROUP BY e.{node_col}, e.relation_id
            """, (batch, all_scored_rels))
            for nid, rel_id, cnt in cur.fetchall():
                _accumulate_scored(nid, rel_id, cnt)
            # Total edge count (any relation, English counterpart, weight >= 1).
            cur.execute(f"""
                SELECT e.{node_col}, COUNT(*)
                FROM edges e
                JOIN nodes n_other ON e.{other_col} = n_other.id
                WHERE e.{node_col} = ANY(%s)
                  AND e.weight >= 1.0
                  AND n_other.uri LIKE '/c/en/%%'
                GROUP BY e.{node_col}
            """, (batch,))
            for nid, cnt in cur.fetchall():
                word = node_id_to_word.get(nid)
                if word and word in candidates:
                    candidates[word]['edge_count'] += cnt
    # Calculate tangibility scores: share of concrete edges, 2 decimals.
    for word, data in candidates.items():
        concrete = data.get('concrete_count', 0)
        abstract = data.get('abstract_count', 0)
        total = concrete + abstract
        data['tangibility_score'] = round(concrete / total, 2) if total > 0 else 0.0
    cur.close()
    return candidates
def step6_write_output(candidates, output_path='/home/john/Development/folksy-generator/data/folksy_vocab.csv'):
    """Write the final vocabulary CSV, most-connected words first.

    Args:
        candidates: dict of word -> {'categories', 'tangibility_score',
            'edge_count'} as built by the earlier steps.
        output_path: destination CSV path. Defaults to the original
            hard-coded project location, so existing callers are unchanged.

    Returns:
        The path the CSV was written to.
    """
    print(f"Step 6: Writing output to {output_path}")
    # Sort by edge_count descending
    sorted_words = sorted(candidates.items(), key=lambda x: x[1]['edge_count'], reverse=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'categories', 'tangibility_score', 'conceptnet_edge_count', 'frequency_rank'])
        for word, data in sorted_words:
            writer.writerow([
                word,
                ','.join(data['categories']),
                data['tangibility_score'],
                data['edge_count'],
                0  # frequency_rank placeholder — not computed by this script
            ])
    print(f" Wrote {len(sorted_words)} words")
    return output_path
def _print_summary(candidates, path):
    """Print end-of-run statistics for the generated vocabulary."""
    scores = [d['tangibility_score'] for d in candidates.values() if d['tangibility_score'] > 0]
    edges = [d['edge_count'] for d in candidates.values()]
    print(f"\nSummary:")
    print(f" Total words: {len(candidates)}")
    print(f" Words with tangibility > 0: {len(scores)}")
    if scores:
        print(f" Avg tangibility: {sum(scores)/len(scores):.2f}")
    if edges:
        print(f" Avg edge count: {sum(edges)/len(edges):.1f}")
        print(f" Max edge count: {max(edges)}")
        print(f" Min edge count: {min(edges)}")
    print(f" Output: {path}")


def main():
    """Drive the full extraction pipeline against the local ConceptNet DB."""
    conn = connect()
    try:
        # Steps 1+2: gather candidates (the filtering lives in the SQL).
        vocab = step1_gather_candidates(conn)
        # Step 5 runs before scoring so manual additions get scored too.
        vocab = step5_add_manual(conn, vocab)
        # Steps 3+4: tangibility scores and edge counts.
        vocab = step3_4_tangibility_and_edges(conn, vocab)
        # Step 6: write the CSV, then report summary stats.
        path = step6_write_output(vocab)
        _print_summary(vocab, path)
    finally:
        conn.close()


if __name__ == '__main__':
    main()