Initial 'folksy idiom' generator

This commit is contained in:
John McCardle 2026-02-15 14:04:25 -05:00
commit 8c8a058301
11 changed files with 14485 additions and 0 deletions

View file

@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Extract ConceptNet relationships between words in the folksy vocabulary.
Reads folksy_vocab.csv, queries PostgreSQL conceptnet5 database, and writes
folksy_relations.csv with columns: start_word, end_word, relation, weight, surface_text
"""
import csv
import psycopg2
# Input: vocabulary CSV; must contain a "word" column (one word per row).
INPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_vocab.csv"
# Output: extracted relationships CSV (start_word, end_word, relation, weight, surface_text).
OUTPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_relations.csv"
# ConceptNet relation names to extract; each becomes a "/r/<name>" URI when
# querying the `relations` table.
RELATION_TYPES = [
    "UsedFor", "AtLocation", "CapableOf", "HasA", "PartOf", "Causes",
    "CausesDesire", "HasPrerequisite", "ReceivesAction", "Desires",
    "LocatedNear", "CreatedBy", "MadeOf", "HasProperty", "MotivatedByGoal",
    "HasSubevent",
]
def _read_words(path):
    """Return the non-empty, stripped 'word' values from the vocab CSV at *path*.

    Blank rows are skipped so they cannot produce bogus "/c/en/" node URIs.
    """
    with open(path, "r", newline="") as f:
        return [w for row in csv.DictReader(f) if (w := row["word"].strip())]


def _write_relations(path, results):
    """Write (start_word, end_word, relation, weight, surface_text) rows to *path*."""
    with open(path, "w", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text"])
        writer.writerows(results)


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Extract ConceptNet relationships between words in the folksy vocabulary.

    Reads the vocabulary from *input_path*, queries the local PostgreSQL
    `conceptnet5` database for edges whose start and end nodes are both in the
    vocabulary (restricted to RELATION_TYPES, weight >= 1.0), and writes the
    results to *output_path* sorted by weight descending.

    Parameters default to the module-level INPUT_PATH / OUTPUT_PATH so existing
    callers are unaffected.
    """
    # Step 1: Read the word list from the vocabulary CSV.
    words = _read_words(input_path)
    print(f"Read {len(words)} words from {input_path}")

    # Build node and relation URIs in ConceptNet's path scheme.
    word_uris = [f"/c/en/{w}" for w in words]
    relation_uris = [f"/r/{r}" for r in RELATION_TYPES]

    conn = psycopg2.connect(dbname="conceptnet5")
    try:
        cur = conn.cursor()
        try:
            # Step 2: Look up all node IDs for these words. Some vocabulary
            # words may have no ConceptNet node at all; report them.
            cur.execute("SELECT id, uri FROM nodes WHERE uri = ANY(%s)", (word_uris,))
            node_rows = cur.fetchall()
            uri_to_id = {uri: nid for nid, uri in node_rows}
            id_to_uri = {nid: uri for nid, uri in node_rows}
            found_words = [uri.replace("/c/en/", "") for uri in uri_to_id]
            missing_words = set(words) - set(found_words)
            print(f"Found {len(uri_to_id)} node IDs out of {len(words)} words")
            if missing_words:
                print(f"Missing {len(missing_words)} words: {sorted(missing_words)[:20]}...")
            node_ids = list(uri_to_id.values())

            # Step 3: Look up relation IDs for the relation types we care about.
            cur.execute("SELECT id, uri FROM relations WHERE uri = ANY(%s)", (relation_uris,))
            rel_rows = cur.fetchall()
            rel_id_to_name = {rid: uri.replace("/r/", "") for rid, uri in rel_rows}
            rel_ids = list(rel_id_to_name.keys())
            print(f"Found {len(rel_ids)} relation types: {sorted(rel_id_to_name.values())}")

            # Step 4: Query edges where both start and end are in our folksy
            # node set, relation is one of our types, and weight >= 1.0.
            cur.execute(
                """
                SELECT e.start_id, e.end_id, e.relation_id, e.weight, e.data->>'surfaceText'
                FROM edges e
                WHERE e.start_id = ANY(%s)
                  AND e.end_id = ANY(%s)
                  AND e.relation_id = ANY(%s)
                  AND e.weight >= 1.0
                ORDER BY e.weight DESC
                """,
                (node_ids, node_ids, rel_ids),
            )
            rows = cur.fetchall()
            print(f"Found {len(rows)} edges")
        finally:
            # Always release DB resources, even if a query fails.
            cur.close()
    finally:
        conn.close()

    # Step 5: Convert node IDs back to word strings.
    results = [
        (
            id_to_uri[start_id].replace("/c/en/", ""),
            id_to_uri[end_id].replace("/c/en/", ""),
            rel_id_to_name[relation_id],
            weight,
            surface_text or "",
        )
        for start_id, end_id, relation_id, weight, surface_text in rows
    ]

    # Step 6: Write output CSV (already sorted by weight descending by the query).
    _write_relations(output_path, results)
    print(f"Wrote {len(results)} relationships to {output_path}")
# Run the extraction when invoked as a script (not on import).
if __name__ == "__main__":
    main()