# folksy_idioms/scripts/extract_from_conceptnet.py
#!/usr/bin/env python3
"""
Build folksy vocabulary CSV from ConceptNet5 PostgreSQL database.
Steps:
1. Gather candidates via IsA categories
2. Filter to single-word concrete nouns
3. Calculate tangibility score
4. Count total edges
5. Add manual additions
6. Write output CSV
"""
import psycopg2
import csv
import sys
# Name of the local ConceptNet5 PostgreSQL database to connect to.
DB_NAME = "conceptnet5"

# IsA categories and their node IDs (pre-looked up).
# Maps ConceptNet node id -> category label used in the output CSV.
CATEGORY_IDS = {
    20865: 'animal', 22802: 'beverage', 20866: 'bird', 40218: 'building',
    21578: 'clothing', 144957: 'container', 26028: 'crop', 148705: 'fabric',
    22922: 'fish', 26249: 'flower', 22803: 'food', 31187: 'fruit',
    22610: 'furniture', 114948: 'grain', 641297: 'herb', 152432: 'insect',
    152437: 'instrument', 153470: 'livestock', 33562: 'material',
    25893: 'metal', 20869: 'mineral', 20872: 'plant', 37511: 'rock',
    25753: 'seed', 44101: 'spice', 37357: 'stone', 159255: 'tool',
    40174: 'tree', 20874: 'vegetable', 144388: 'vehicle', 156331: 'weapon',
    31507: 'wood'
}

# Relation IDs (pre-looked up from relations table).
RELATION_IDS = {
    'AtLocation': 1, 'MadeOf': 25, 'PartOf': 33, 'UsedFor': 39,
    'HasA': 15, 'ReceivesAction': 34, 'CreatedBy': 5,
    'HasProperty': 20, 'Causes': 3, 'MotivatedByGoal': 27,
    'CausesDesire': 4, 'Desires': 8, 'HasSubevent': 21
}

# Relation groups used for the tangibility score: edges in CONCRETE_RELS
# suggest a physical thing, edges in ABSTRACT_RELS suggest abstract usage.
CONCRETE_RELS = [RELATION_IDS[r] for r in ['AtLocation', 'MadeOf', 'PartOf', 'UsedFor', 'HasA', 'ReceivesAction', 'CreatedBy']]
ABSTRACT_RELS = [RELATION_IDS[r] for r in ['HasProperty', 'Causes', 'MotivatedByGoal', 'CausesDesire', 'Desires', 'HasSubevent']]

# Folksy words force-included even when the IsA query misses them.
MANUAL_ADDITIONS = [
    'well', 'fence', 'barn', 'creek', 'porch', 'chimney', 'saddle', 'hearth',
    'kettle', 'plow', 'silo', 'trough', 'yoke', 'anvil', 'bellows', 'thimble',
    'lantern', 'candle', 'broom', 'bucket', 'ladder', 'rope', 'nail', 'hay',
    'straw', 'wool', 'leather', 'tar', 'wax', 'cork', 'flint', 'chalk', 'clay',
    'ash', 'soot', 'rust', 'mold', 'moss', 'bark', 'root', 'stem', 'thorn',
    'vine', 'husk', 'shell', 'pit', 'den', 'nest', 'burrow', 'coop', 'stable',
    'pasture', 'meadow', 'orchard', 'garden', 'pond', 'ditch', 'ridge',
    'hollow', 'furrow'
]

# Common-sense categories for manual additions that might not have IsA edges.
# Values are comma-separated category labels (split on ',' when merged).
MANUAL_CATEGORIES = {
    'well': 'structure', 'fence': 'structure', 'barn': 'building',
    'creek': 'water,landscape', 'porch': 'structure', 'chimney': 'structure',
    'saddle': 'tool', 'hearth': 'structure', 'kettle': 'container',
    'plow': 'tool', 'silo': 'building', 'trough': 'container',
    'yoke': 'tool', 'anvil': 'tool', 'bellows': 'tool',
    'thimble': 'tool', 'lantern': 'tool', 'candle': 'tool',
    'broom': 'tool', 'bucket': 'container', 'ladder': 'tool',
    'rope': 'material', 'nail': 'tool', 'hay': 'plant,crop',
    'straw': 'material,crop', 'wool': 'fabric,material',
    'leather': 'fabric,material', 'tar': 'material', 'wax': 'material',
    'cork': 'material', 'flint': 'stone', 'chalk': 'material,mineral',
    'clay': 'material', 'ash': 'material', 'soot': 'material',
    'rust': 'material', 'mold': 'organism', 'moss': 'plant',
    'bark': 'plant', 'root': 'plant', 'stem': 'plant',
    'thorn': 'plant', 'vine': 'plant', 'husk': 'plant',
    'shell': 'container', 'pit': 'seed,landscape', 'den': 'shelter',
    'nest': 'shelter', 'burrow': 'shelter', 'coop': 'building',
    'stable': 'building', 'pasture': 'landscape', 'meadow': 'landscape',
    'orchard': 'landscape', 'garden': 'landscape', 'pond': 'water,landscape',
    'ditch': 'landscape', 'ridge': 'landscape', 'hollow': 'landscape',
    'furrow': 'landscape'
}

# Words to exclude (misspellings, plural forms, overly abstract, non-folksy).
EXCLUDE_WORDS = {
    'bannana', 'brocolli', 'cardimom', 'carary', 'cassorwary', 'cucmber',
    'cummin', 'dragonsnap', 'elefefant', 'guitare', 'hollie', 'potoato',
    'rhodedendron', 'sandwitch', 'saphire', 'saxiphone', 'soupd', 'tourqouise',
    'tiramisu', 'bbq', 'cajun', 'mexican', 'pepsi', 'coke', 'spam', 'accordian',
    'comealong', 'rooter', 'tweety', 'guru1', 'softball', 'nutdriver',
    'posessions', 'anus', 'bloodsucker', 'whorehouse', 'cuck',
    # Plurals when singular exists
    'blueberries', 'carrots', 'eggs', 'pears', 'peas', 'peaches', 'limes',
    'raisins', 'plums', 'rubies', 'emeralds', 'shirts', 'shoes', 'tomatoes',
    'potatoes', 'plastics', 'vegetables', 'animals', 'products', 'vertebrates',
    'pianos', 'lures', 'pens', 'crampons',
    # Too technical/non-folksy
    'bronchoscope', 'dioptometer', 'calibrachoa', 'brachycome', 'diascia',
    'osteospermum', 'nemesia', 'helichrysum', 'scavola', 'silphium',
    'cuphea', 'euonymus', 'arborvitae', 'ipomoea', 'bacopa', 'lamium',
    'falsecypress', 'boottree', 'sedimentary', 'catheter', 'caltrops',
    'argyranthemum', 'sunn',
    # Too generic/abstract
    # NOTE(review): 'products' is listed twice (also under plurals) — harmless in a set literal.
    'creature', 'invertebrate', 'primate', 'marsupial', 'crustacean',
    'arthropod', 'avian', 'amphibian', 'rodent', 'pet', 'explosive',
    'automatic', 'percussion', 'woodwind', 'laundry', 'products',
    # fictional
    'unicorn', 'dragon', 'pinguin',
    # remaining misspellings / obscure non-folksy fish
    'trumbone', 'eidar', 'monchong', 'opakapaka', 'opah', 'cumquat',
}
def connect():
    """Open a psycopg2 connection to the local ConceptNet5 database (DB_NAME)."""
    return psycopg2.connect(dbname=DB_NAME)
def step1_gather_candidates(conn):
    """Gather all English base single-word nodes that IsA our categories.

    One aggregate query finds every English node ('/c/en/<word>', no sense
    suffix '/..', no underscore i.e. a single word) that has an IsA edge
    (relation_id 23) with weight >= 1.0 into a node in CATEGORY_IDS, and
    collects the distinct matching category ids per word.

    Args:
        conn: open psycopg2 connection to the ConceptNet5 database.

    Returns:
        dict mapping word -> {'categories': sorted labels,
        'tangibility_score': 0.0, 'edge_count': 0}. Words in EXCLUDE_WORDS
        are dropped.
    """
    print("Step 1: Gathering IsA candidates...")
    cur = conn.cursor()
    # Category ids are bound as a parameter (= ANY(%s)) instead of being
    # interpolated into the SQL text. Because parameters are supplied,
    # psycopg2 treats %% as a literal % in the LIKE patterns below.
    cur.execute("""
        SELECT
            SUBSTRING(n_start.uri FROM 7) AS word,  -- SQL is 1-indexed: '/c/en/' is 6 chars
            ARRAY_AGG(DISTINCT n_end.id) AS cat_ids
        FROM edges e
        JOIN nodes n_start ON e.start_id = n_start.id
        JOIN nodes n_end ON e.end_id = n_end.id
        WHERE e.relation_id = 23
          AND e.weight >= 1.0
          AND n_start.uri LIKE '/c/en/%%'
          AND n_start.uri NOT LIKE '/c/en/%%/%%'
          AND n_start.uri NOT LIKE '/c/en/%%\\_%%'
          AND n_end.id = ANY(%s)
        GROUP BY n_start.uri
    """, (list(CATEGORY_IDS.keys()),))
    candidates = {}
    for word, cat_ids in cur.fetchall():
        # Defensive guard kept from the old off-by-one SUBSTRING (FROM 6),
        # which left a leading '/' on every word.
        word = word.lstrip('/')
        if word in EXCLUDE_WORDS:
            continue
        categories = sorted({CATEGORY_IDS[cid] for cid in cat_ids if cid in CATEGORY_IDS})
        candidates[word] = {
            'categories': categories,
            'tangibility_score': 0.0,
            'edge_count': 0
        }
    cur.close()
    print(f" Found {len(candidates)} candidates after filtering")
    return candidates
def step5_add_manual(conn, candidates):
    """Fold the MANUAL_ADDITIONS word list into the candidate dict.

    New words get categories from MANUAL_CATEGORIES (or 'misc') with zeroed
    scores; words already present get their category sets merged. Mutates
    and returns `candidates`.
    """
    print("Step 5: Adding manual additions...")
    new_count = 0
    for term in MANUAL_ADDITIONS:
        entry = candidates.get(term)
        if entry is None:
            # Not found by the IsA query: seed a fresh record.
            labels = MANUAL_CATEGORIES.get(term, 'misc').split(',')
            candidates[term] = {
                'categories': sorted(labels),
                'tangibility_score': 0.0,
                'edge_count': 0
            }
            new_count += 1
        else:
            # Already present: union manual categories into the existing set.
            merged = set(entry['categories'])
            merged.update(c for c in MANUAL_CATEGORIES.get(term, '').split(',') if c)
            entry['categories'] = sorted(merged)
    print(f" Added {new_count} new words from manual list")
    print(f" Total candidates: {len(candidates)}")
    return candidates
def step3_4_tangibility_and_edges(conn, candidates):
    """Calculate tangibility scores and total edge counts for all candidates.

    Tangibility = concrete / (concrete + abstract), where the two buckets
    count edges whose relation is in CONCRETE_RELS / ABSTRACT_RELS. Only
    edges with weight >= 1.0 whose other endpoint is English ('/c/en/...')
    are counted, with the candidate node on either end of the edge.
    Candidates with no node row in the DB are removed in place.

    Args:
        conn: open psycopg2 connection to the ConceptNet5 database.
        candidates: dict from step1/step5; mutated and returned.
    """
    print("Steps 3-4: Calculating tangibility scores and edge counts...")
    cur = conn.cursor()
    # Batch-resolve all candidate words to node ids in a single query.
    words = list(candidates.keys())
    uris = [f'/c/en/{w}' for w in words]
    cur.execute("""
        SELECT uri, id FROM nodes
        WHERE uri = ANY(%s)
    """, (uris,))
    # uri[6:] strips the 6-character '/c/en/' prefix (Python 0-indexed).
    word_to_node_id = {uri[6:]: nid for uri, nid in cur.fetchall()}
    # Debug: show a sample
    sample = list(word_to_node_id.items())[:5]
    print(f" Sample word->id mappings: {sample}")
    print(f" Found node IDs for {len(word_to_node_id)}/{len(words)} words")
    # Words without node IDs - remove them
    missing = [w for w in words if w not in word_to_node_id]
    if missing:
        print(f" Missing from DB (removing): {missing[:20]}...")
        for w in missing:
            del candidates[w]
    if not word_to_node_id:
        print(" ERROR: No node IDs found!")
        return candidates
    node_id_to_word = {v: k for k, v in word_to_node_id.items()}
    all_scored_rels = CONCRETE_RELS + ABSTRACT_RELS

    def _accumulate_scored(nid, rel_id, cnt):
        """Add one (node, relation) count to the matching concrete/abstract bucket."""
        word = node_id_to_word.get(nid)
        if not word or word not in candidates:
            return
        if rel_id in CONCRETE_RELS:
            candidates[word]['concrete_count'] = candidates[word].get('concrete_count', 0) + cnt
        elif rel_id in ABSTRACT_RELS:
            candidates[word]['abstract_count'] = candidates[word].get('abstract_count', 0) + cnt

    # Process node ids in batches to keep each query's result set small.
    batch_size = 200
    node_id_list = list(word_to_node_id.values())
    for batch_start in range(0, len(node_id_list), batch_size):
        batch = node_id_list[batch_start:batch_start + batch_size]
        if batch_start % 1000 == 0:
            print(f" Processing batch {batch_start}/{len(node_id_list)}...")
        # Count edges with the candidate as start node, then as end node.
        # The column names are our own constants, so the f-string SQL is safe;
        # all data values are bound as parameters.
        for node_col, other_col in (('start_id', 'end_id'), ('end_id', 'start_id')):
            # Scored relations (concrete + abstract), broken out per relation.
            cur.execute(f"""
                SELECT e.{node_col}, e.relation_id, COUNT(*)
                FROM edges e
                JOIN nodes n_other ON e.{other_col} = n_other.id
                WHERE e.{node_col} = ANY(%s)
                  AND e.weight >= 1.0
                  AND e.relation_id = ANY(%s)
                  AND n_other.uri LIKE '/c/en/%%'
                GROUP BY e.{node_col}, e.relation_id
            """, (batch, all_scored_rels))
            for nid, rel_id, cnt in cur.fetchall():
                _accumulate_scored(nid, rel_id, cnt)
            # Total edge count (any relation, English counterpart, weight >= 1).
            cur.execute(f"""
                SELECT e.{node_col}, COUNT(*)
                FROM edges e
                JOIN nodes n_other ON e.{other_col} = n_other.id
                WHERE e.{node_col} = ANY(%s)
                  AND e.weight >= 1.0
                  AND n_other.uri LIKE '/c/en/%%'
                GROUP BY e.{node_col}
            """, (batch,))
            for nid, cnt in cur.fetchall():
                word = node_id_to_word.get(nid)
                if word and word in candidates:
                    candidates[word]['edge_count'] += cnt
    # Calculate tangibility scores: share of concrete edges, 2 decimals.
    for word, data in candidates.items():
        concrete = data.get('concrete_count', 0)
        abstract = data.get('abstract_count', 0)
        total = concrete + abstract
        data['tangibility_score'] = round(concrete / total, 2) if total > 0 else 0.0
    cur.close()
    return candidates
def step6_write_output(candidates, output_path='/home/john/Development/folksy-generator/data/folksy_vocab.csv'):
    """Write the final vocabulary CSV, most-connected words first.

    Args:
        candidates: dict of word -> {'categories', 'tangibility_score',
            'edge_count'} as built by the earlier steps.
        output_path: destination CSV path. Defaults to the original
            hard-coded project location, so existing callers are unchanged.

    Returns:
        The path the CSV was written to.
    """
    print(f"Step 6: Writing output to {output_path}")
    # Sort by edge_count descending
    sorted_words = sorted(candidates.items(), key=lambda x: x[1]['edge_count'], reverse=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'categories', 'tangibility_score', 'conceptnet_edge_count', 'frequency_rank'])
        for word, data in sorted_words:
            writer.writerow([
                word,
                ','.join(data['categories']),
                data['tangibility_score'],
                data['edge_count'],
                0  # frequency_rank placeholder — not computed by this script
            ])
    print(f" Wrote {len(sorted_words)} words")
    return output_path
def _print_summary(candidates, path):
    """Print end-of-run statistics for the generated vocabulary."""
    scores = [d['tangibility_score'] for d in candidates.values() if d['tangibility_score'] > 0]
    edges = [d['edge_count'] for d in candidates.values()]
    print(f"\nSummary:")
    print(f" Total words: {len(candidates)}")
    print(f" Words with tangibility > 0: {len(scores)}")
    if scores:
        print(f" Avg tangibility: {sum(scores)/len(scores):.2f}")
    if edges:
        print(f" Avg edge count: {sum(edges)/len(edges):.1f}")
        print(f" Max edge count: {max(edges)}")
        print(f" Min edge count: {min(edges)}")
    print(f" Output: {path}")


def main():
    """Drive the full extraction pipeline against the local ConceptNet DB."""
    conn = connect()
    try:
        # Steps 1+2: gather candidates (the filtering lives in the SQL).
        vocab = step1_gather_candidates(conn)
        # Step 5 runs before scoring so manual additions get scored too.
        vocab = step5_add_manual(conn, vocab)
        # Steps 3+4: tangibility scores and edge counts.
        vocab = step3_4_tangibility_and_edges(conn, vocab)
        # Step 6: write the CSV, then report summary stats.
        path = step6_write_output(vocab)
        _print_summary(vocab, path)
    finally:
        conn.close()


if __name__ == '__main__':
    main()