#!/usr/bin/env python3
"""
Build folksy vocabulary CSV from ConceptNet5 PostgreSQL database.

Steps:
1. Gather candidates via IsA categories
2. Filter to single-word concrete nouns
3. Calculate tangibility score
4. Count total edges
5. Add manual additions
6. Write output CSV
"""
import csv
import sys

DB_NAME = "conceptnet5"

# Default location for the generated CSV (overridable via step6_write_output).
DEFAULT_OUTPUT_PATH = '/home/john/Development/folksy-generator/data/folksy_vocab.csv'

# IsA categories and their node IDs (pre-looked up)
CATEGORY_IDS = {
    20865: 'animal', 22802: 'beverage', 20866: 'bird', 40218: 'building',
    21578: 'clothing', 144957: 'container', 26028: 'crop', 148705: 'fabric',
    22922: 'fish', 26249: 'flower', 22803: 'food', 31187: 'fruit',
    22610: 'furniture', 114948: 'grain', 641297: 'herb', 152432: 'insect',
    152437: 'instrument', 153470: 'livestock', 33562: 'material',
    25893: 'metal', 20869: 'mineral', 20872: 'plant', 37511: 'rock',
    25753: 'seed', 44101: 'spice', 37357: 'stone', 159255: 'tool',
    40174: 'tree', 20874: 'vegetable', 144388: 'vehicle', 156331: 'weapon',
    31507: 'wood'
}

# Relation IDs (pre-looked up from the relations table)
RELATION_IDS = {
    'IsA': 23,
    'AtLocation': 1, 'MadeOf': 25, 'PartOf': 33, 'UsedFor': 39,
    'HasA': 15, 'ReceivesAction': 34, 'CreatedBy': 5,
    'HasProperty': 20, 'Causes': 3, 'MotivatedByGoal': 27,
    'CausesDesire': 4, 'Desires': 8, 'HasSubevent': 21
}

# Relations that suggest a word names something physical...
CONCRETE_RELS = [RELATION_IDS[r] for r in
                 ['AtLocation', 'MadeOf', 'PartOf', 'UsedFor', 'HasA',
                  'ReceivesAction', 'CreatedBy']]
# ...versus relations that suggest an abstract concept.
ABSTRACT_RELS = [RELATION_IDS[r] for r in
                 ['HasProperty', 'Causes', 'MotivatedByGoal', 'CausesDesire',
                  'Desires', 'HasSubevent']]

# Frozenset views for O(1) membership tests in the per-row tally loops.
_CONCRETE_REL_SET = frozenset(CONCRETE_RELS)
_ABSTRACT_REL_SET = frozenset(ABSTRACT_RELS)

MANUAL_ADDITIONS = [
    'well', 'fence', 'barn', 'creek', 'porch', 'chimney', 'saddle', 'hearth',
    'kettle', 'plow', 'silo', 'trough', 'yoke', 'anvil', 'bellows', 'thimble',
    'lantern', 'candle', 'broom', 'bucket', 'ladder', 'rope', 'nail', 'hay',
    'straw', 'wool', 'leather', 'tar', 'wax', 'cork', 'flint', 'chalk',
    'clay', 'ash', 'soot', 'rust', 'mold', 'moss', 'bark', 'root', 'stem',
    'thorn', 'vine', 'husk', 'shell', 'pit', 'den', 'nest', 'burrow', 'coop',
    'stable', 'pasture', 'meadow', 'orchard', 'garden', 'pond', 'ditch',
    'ridge', 'hollow', 'furrow'
]

# Common-sense categories for manual additions that might not have IsA edges
MANUAL_CATEGORIES = {
    'well': 'structure', 'fence': 'structure', 'barn': 'building',
    'creek': 'water,landscape', 'porch': 'structure', 'chimney': 'structure',
    'saddle': 'tool', 'hearth': 'structure', 'kettle': 'container',
    'plow': 'tool', 'silo': 'building', 'trough': 'container',
    'yoke': 'tool', 'anvil': 'tool', 'bellows': 'tool', 'thimble': 'tool',
    'lantern': 'tool', 'candle': 'tool', 'broom': 'tool',
    'bucket': 'container', 'ladder': 'tool', 'rope': 'material',
    'nail': 'tool', 'hay': 'plant,crop', 'straw': 'material,crop',
    'wool': 'fabric,material', 'leather': 'fabric,material',
    'tar': 'material', 'wax': 'material', 'cork': 'material',
    'flint': 'stone', 'chalk': 'material,mineral', 'clay': 'material',
    'ash': 'material', 'soot': 'material', 'rust': 'material',
    'mold': 'organism', 'moss': 'plant', 'bark': 'plant', 'root': 'plant',
    'stem': 'plant', 'thorn': 'plant', 'vine': 'plant', 'husk': 'plant',
    'shell': 'container', 'pit': 'seed,landscape', 'den': 'shelter',
    'nest': 'shelter', 'burrow': 'shelter', 'coop': 'building',
    'stable': 'building', 'pasture': 'landscape', 'meadow': 'landscape',
    'orchard': 'landscape', 'garden': 'landscape', 'pond': 'water,landscape',
    'ditch': 'landscape', 'ridge': 'landscape', 'hollow': 'landscape',
    'furrow': 'landscape'
}

# Words to exclude (misspellings, plural forms, overly abstract, non-folksy)
EXCLUDE_WORDS = {
    'bannana', 'brocolli', 'cardimom', 'carary', 'cassorwary', 'cucmber',
    'cummin', 'dragonsnap', 'elefefant', 'guitare', 'hollie', 'potoato',
    'rhodedendron', 'sandwitch', 'saphire', 'saxiphone', 'soupd',
    'tourqouise', 'tiramisu', 'bbq', 'cajun', 'mexican', 'pepsi', 'coke',
    'spam', 'accordian', 'comealong', 'rooter', 'tweety', 'guru1',
    'softball', 'nutdriver', 'posessions', 'anus', 'bloodsucker',
    'whorehouse', 'cuck',
    # Plurals when singular exists
    'blueberries', 'carrots', 'eggs', 'pears', 'peas', 'peaches', 'limes',
    'raisins', 'plums', 'rubies', 'emeralds', 'shirts', 'shoes', 'tomatoes',
    'potatoes', 'plastics', 'vegetables', 'animals', 'products',
    'vertebrates', 'pianos', 'lures', 'pens', 'crampons',
    # Too technical/non-folksy
    'bronchoscope', 'dioptometer', 'calibrachoa', 'brachycome', 'diascia',
    'osteospermum', 'nemesia', 'helichrysum', 'scavola', 'silphium',
    'cuphea', 'euonymus', 'arborvitae', 'ipomoea', 'bacopa', 'lamium',
    'falsecypress', 'boottree', 'sedimentary', 'catheter', 'caltrops',
    'argyranthemum', 'sunn',
    # Too generic/abstract
    'creature', 'invertebrate', 'primate', 'marsupial', 'crustacean',
    'arthropod', 'avian', 'amphibian', 'rodent', 'pet', 'explosive',
    'automatic', 'percussion', 'woodwind', 'laundry',
    # fictional
    'unicorn', 'dragon', 'pinguin',
    # remaining misspellings / obscure non-folksy fish
    'trumbone', 'eidar', 'monchong', 'opakapaka', 'opah', 'cumquat',
}


def connect():
    """Open a connection to the local ConceptNet5 PostgreSQL database."""
    # Imported lazily so the module can be imported (e.g. to unit-test the
    # pure-Python steps) on machines without the psycopg2 driver installed.
    import psycopg2
    return psycopg2.connect(dbname=DB_NAME)


def step1_gather_candidates(conn):
    """Gather all English base single-word nodes that IsA our categories.

    Returns a dict mapping word -> {'categories': [...],
    'tangibility_score': 0.0, 'edge_count': 0} for every word that
    survives the SQL filters and EXCLUDE_WORDS.
    """
    print("Step 1: Gathering IsA candidates...")
    cur = conn.cursor()
    # SUBSTRING ... FROM 7 strips the '/c/en/' prefix (SQL positions are
    # 1-indexed; the old FROM 6 left a leading '/' that had to be stripped
    # again in Python).  The NOT LIKE filters drop sense-tagged URIs
    # ('/c/en/word/n') and multi-word terms ('/c/en/two_words'); the
    # default LIKE escape character '\' makes \_ match a literal underscore.
    # IDs are passed as query parameters rather than interpolated.
    cur.execute("""
        SELECT SUBSTRING(n_start.uri FROM 7) AS word,
               ARRAY_AGG(DISTINCT n_end.id) AS cat_ids
        FROM edges e
        JOIN nodes n_start ON e.start_id = n_start.id
        JOIN nodes n_end ON e.end_id = n_end.id
        WHERE e.relation_id = %s
          AND e.weight >= 1.0
          AND n_start.uri LIKE '/c/en/%%'
          AND n_start.uri NOT LIKE '/c/en/%%/%%'
          AND n_start.uri NOT LIKE '/c/en/%%\\_%%'
          AND n_end.id = ANY(%s)
        GROUP BY n_start.uri
    """, (RELATION_IDS['IsA'], list(CATEGORY_IDS.keys())))

    candidates = {}
    for word, cat_ids in cur.fetchall():
        if word in EXCLUDE_WORDS:
            continue
        categories = sorted({CATEGORY_IDS[cid] for cid in cat_ids
                             if cid in CATEGORY_IDS})
        candidates[word] = {
            'categories': categories,
            'tangibility_score': 0.0,
            'edge_count': 0
        }
    cur.close()
    print(f"  Found {len(candidates)} candidates after filtering")
    return candidates


def step5_add_manual(conn, candidates):
    """Add manual additions that aren't already in candidates.

    `conn` is unused; it is kept so all step functions share a signature.
    Mutates and returns `candidates`.
    """
    print("Step 5: Adding manual additions...")
    added = 0
    for word in MANUAL_ADDITIONS:
        if word not in candidates:
            cats = MANUAL_CATEGORIES.get(word, 'misc').split(',')
            candidates[word] = {
                'categories': sorted(cats),
                'tangibility_score': 0.0,
                'edge_count': 0
            }
            added += 1
        else:
            # Merge manual categories with the ones found via IsA edges.
            existing_cats = set(candidates[word]['categories'])
            manual_cats = set(MANUAL_CATEGORIES.get(word, '').split(',')) - {''}
            candidates[word]['categories'] = sorted(existing_cats | manual_cats)
    print(f"  Added {added} new words from manual list")
    print(f"  Total candidates: {len(candidates)}")
    return candidates


def _tally_relation_counts(rows, node_id_to_word, candidates):
    """Fold (node_id, relation_id, count) rows into per-word tallies.

    Increments 'concrete_count' / 'abstract_count' on each candidate
    according to which relation bucket the row's relation falls into.
    """
    for nid, rel_id, cnt in rows:
        word = node_id_to_word.get(nid)
        if not word or word not in candidates:
            continue
        data = candidates[word]
        if rel_id in _CONCRETE_REL_SET:
            data['concrete_count'] = data.get('concrete_count', 0) + cnt
        elif rel_id in _ABSTRACT_REL_SET:
            data['abstract_count'] = data.get('abstract_count', 0) + cnt


def step3_4_tangibility_and_edges(conn, candidates):
    """Calculate tangibility scores and total edge counts for all candidates.

    tangibility_score = concrete_edges / (concrete_edges + abstract_edges),
    rounded to 2 places; edge_count is the total number of English edges
    (any relation, weight >= 1.0) touching the word's node.  Words whose
    node URI is missing from the DB are removed from `candidates`.
    """
    print("Steps 3-4: Calculating tangibility scores and edge counts...")
    cur = conn.cursor()

    # Batch-resolve candidate words to node IDs in one query.
    words = list(candidates.keys())
    uris = [f'/c/en/{w}' for w in words]
    cur.execute("""
        SELECT uri, id FROM nodes WHERE uri = ANY(%s)
    """, (uris,))
    word_to_node_id = {}
    for uri, nid in cur.fetchall():
        word = uri[6:]  # strip '/c/en/' (Python 0-indexed: 6 chars)
        word_to_node_id[word] = nid

    # Debug: show a sample
    sample = list(word_to_node_id.items())[:5]
    print(f"  Sample word->id mappings: {sample}")
    print(f"  Found node IDs for {len(word_to_node_id)}/{len(words)} words")

    # Words without node IDs cannot be scored - remove them.
    missing = [w for w in words if w not in word_to_node_id]
    if missing:
        print(f"  Missing from DB (removing): {missing[:20]}...")
        for w in missing:
            del candidates[w]

    if not word_to_node_id:
        print("  ERROR: No node IDs found!")
        return candidates

    node_ids = list(word_to_node_id.values())
    node_id_to_word = {v: k for k, v in word_to_node_id.items()}
    all_scored_rels = CONCRETE_RELS + ABSTRACT_RELS

    # For each node (as start or end), count concrete and abstract edges.
    # Only English counterparts count, so the other end is filtered to
    # /c/en/.  Done in batches to keep result sets small.
    batch_size = 200
    for batch_start in range(0, len(node_ids), batch_size):
        batch = node_ids[batch_start:batch_start + batch_size]
        if batch_start % 1000 == 0:
            print(f"  Processing batch {batch_start}/{len(node_ids)}...")

        # Scored-relation counts with the candidate as the start node.
        cur.execute("""
            SELECT e.start_id, e.relation_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.end_id = n_other.id
            WHERE e.start_id = ANY(%s)
              AND e.weight >= 1.0
              AND e.relation_id = ANY(%s)
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY e.start_id, e.relation_id
        """, (batch, all_scored_rels))
        _tally_relation_counts(cur.fetchall(), node_id_to_word, candidates)

        # Same, with the candidate as the end node.
        cur.execute("""
            SELECT e.end_id, e.relation_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.start_id = n_other.id
            WHERE e.end_id = ANY(%s)
              AND e.weight >= 1.0
              AND e.relation_id = ANY(%s)
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY e.end_id, e.relation_id
        """, (batch, all_scored_rels))
        _tally_relation_counts(cur.fetchall(), node_id_to_word, candidates)

        # Total edge count (any relation, English counterpart, weight >= 1).
        cur.execute("""
            SELECT start_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.end_id = n_other.id
            WHERE e.start_id = ANY(%s)
              AND e.weight >= 1.0
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY start_id
        """, (batch,))
        for nid, cnt in cur.fetchall():
            word = node_id_to_word.get(nid)
            if word and word in candidates:
                candidates[word]['edge_count'] += cnt

        cur.execute("""
            SELECT end_id, COUNT(*)
            FROM edges e
            JOIN nodes n_other ON e.start_id = n_other.id
            WHERE e.end_id = ANY(%s)
              AND e.weight >= 1.0
              AND n_other.uri LIKE '/c/en/%%'
            GROUP BY end_id
        """, (batch,))
        for nid, cnt in cur.fetchall():
            word = node_id_to_word.get(nid)
            if word and word in candidates:
                candidates[word]['edge_count'] += cnt

    # Turn the tallies into a 0..1 concreteness ratio.
    for word, data in candidates.items():
        concrete = data.get('concrete_count', 0)
        abstract = data.get('abstract_count', 0)
        total = concrete + abstract
        if total > 0:
            data['tangibility_score'] = round(concrete / total, 2)
        else:
            data['tangibility_score'] = 0.0

    cur.close()
    return candidates


def step6_write_output(candidates, output_path=DEFAULT_OUTPUT_PATH):
    """Write the final CSV, sorted by edge_count descending.

    `output_path` defaults to the project data file but can be overridden
    (e.g. for testing).  Returns the path written.
    """
    print(f"Step 6: Writing output to {output_path}")
    sorted_words = sorted(candidates.items(),
                          key=lambda x: x[1]['edge_count'], reverse=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'categories', 'tangibility_score',
                         'conceptnet_edge_count', 'frequency_rank'])
        for word, data in sorted_words:
            categories = ','.join(data['categories'])
            # frequency_rank is a placeholder (0) filled in downstream.
            writer.writerow([word, categories, data['tangibility_score'],
                             data['edge_count'], 0])
    print(f"  Wrote {len(sorted_words)} words")
    return output_path


def main():
    """Run the full pipeline: gather, augment, score, and write the CSV."""
    conn = connect()
    try:
        # Step 1 + 2 (filtering is built into the SQL)
        candidates = step1_gather_candidates(conn)
        # Step 5: Add manual additions (before scoring so they get scored too)
        candidates = step5_add_manual(conn, candidates)
        # Steps 3 + 4: Tangibility and edge counts
        candidates = step3_4_tangibility_and_edges(conn, candidates)
        # Step 6: Write output
        path = step6_write_output(candidates)

        # Summary stats
        scores = [d['tangibility_score'] for d in candidates.values()
                  if d['tangibility_score'] > 0]
        edges = [d['edge_count'] for d in candidates.values()]
        print(f"\nSummary:")
        print(f"  Total words: {len(candidates)}")
        print(f"  Words with tangibility > 0: {len(scores)}")
        if scores:
            print(f"  Avg tangibility: {sum(scores)/len(scores):.2f}")
        if edges:
            print(f"  Avg edge count: {sum(edges)/len(edges):.1f}")
            print(f"  Max edge count: {max(edges)}")
            print(f"  Min edge count: {min(edges)}")
        print(f"  Output: {path}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()