#!/usr/bin/env python3
"""
Extract ConceptNet relationships between words in the folksy vocabulary.

Reads folksy_vocab.csv, queries PostgreSQL conceptnet5 database, and writes
folksy_relations.csv with columns: start_word, end_word, relation, weight, surface_text
"""

import csv

import psycopg2

INPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_vocab.csv"
|
|
OUTPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_relations.csv"
|
|
|
|
RELATION_TYPES = [
|
|
"UsedFor", "AtLocation", "CapableOf", "HasA", "PartOf", "Causes",
|
|
"CausesDesire", "HasPrerequisite", "ReceivesAction", "Desires",
|
|
"LocatedNear", "CreatedBy", "MadeOf", "HasProperty", "MotivatedByGoal",
|
|
"HasSubevent",
|
|
]
|
|
|
|
|
|
def main():
|
|
# Step 1: Read the word list from folksy_vocab.csv
|
|
words = []
|
|
with open(INPUT_PATH, "r", newline="") as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
words.append(row["word"].strip())
|
|
|
|
print(f"Read {len(words)} words from {INPUT_PATH}")
|
|
|
|
# Build node URIs
|
|
word_uris = [f"/c/en/{w}" for w in words]
|
|
|
|
# Build relation URIs
|
|
relation_uris = [f"/r/{r}" for r in RELATION_TYPES]
|
|
|
|
conn = psycopg2.connect(dbname="conceptnet5")
|
|
cur = conn.cursor()
|
|
|
|
# Step 2: Look up all node IDs for these words
|
|
cur.execute("SELECT id, uri FROM nodes WHERE uri = ANY(%s)", (word_uris,))
|
|
node_rows = cur.fetchall()
|
|
uri_to_id = {uri: nid for nid, uri in node_rows}
|
|
id_to_uri = {nid: uri for nid, uri in node_rows}
|
|
|
|
found_words = [uri.replace("/c/en/", "") for uri in uri_to_id]
|
|
missing_words = set(words) - set(found_words)
|
|
print(f"Found {len(uri_to_id)} node IDs out of {len(words)} words")
|
|
if missing_words:
|
|
print(f"Missing {len(missing_words)} words: {sorted(missing_words)[:20]}...")
|
|
|
|
node_ids = list(uri_to_id.values())
|
|
|
|
# Step 3: Look up relation IDs
|
|
cur.execute("SELECT id, uri FROM relations WHERE uri = ANY(%s)", (relation_uris,))
|
|
rel_rows = cur.fetchall()
|
|
rel_id_to_name = {rid: uri.replace("/r/", "") for rid, uri in rel_rows}
|
|
rel_ids = list(rel_id_to_name.keys())
|
|
|
|
print(f"Found {len(rel_ids)} relation types: {sorted(rel_id_to_name.values())}")
|
|
|
|
# Step 4: Query edges where both start and end are in our folksy node set,
|
|
# relation is one of our types, and weight >= 1.0
|
|
cur.execute(
|
|
"""
|
|
SELECT e.start_id, e.end_id, e.relation_id, e.weight, e.data->>'surfaceText'
|
|
FROM edges e
|
|
WHERE e.start_id = ANY(%s)
|
|
AND e.end_id = ANY(%s)
|
|
AND e.relation_id = ANY(%s)
|
|
AND e.weight >= 1.0
|
|
ORDER BY e.weight DESC
|
|
""",
|
|
(node_ids, node_ids, rel_ids),
|
|
)
|
|
|
|
rows = cur.fetchall()
|
|
print(f"Found {len(rows)} edges")
|
|
|
|
cur.close()
|
|
conn.close()
|
|
|
|
# Step 5: Convert node IDs back to word strings and write CSV
|
|
results = []
|
|
for start_id, end_id, relation_id, weight, surface_text in rows:
|
|
start_word = id_to_uri[start_id].replace("/c/en/", "")
|
|
end_word = id_to_uri[end_id].replace("/c/en/", "")
|
|
relation = rel_id_to_name[relation_id]
|
|
results.append((start_word, end_word, relation, weight, surface_text or ""))
|
|
|
|
# Step 6: Write output CSV sorted by weight descending (already sorted by query)
|
|
with open(OUTPUT_PATH, "w", newline="") as f:
|
|
writer = csv.writer(f, quoting=csv.QUOTE_ALL)
|
|
writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text"])
|
|
for row in results:
|
|
writer.writerow(row)
|
|
|
|
print(f"Wrote {len(results)} relationships to {OUTPUT_PATH}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|