#!/usr/bin/env python3
"""
Extract ConceptNet relationships between words in the folksy vocabulary.

Reads folksy_vocab.csv, queries PostgreSQL conceptnet5 database, and writes
folksy_relations.csv with columns: start_word, end_word, relation, weight, surface_text
"""

import csv

import psycopg2

INPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_vocab.csv"
|
|
OUTPUT_PATH = "/home/john/Development/folksy-generator/data/folksy_relations.csv"
|
|
|
|
RELATION_TYPES = [
|
|
"UsedFor", "AtLocation", "CapableOf", "HasA", "PartOf", "Causes",
|
|
"CausesDesire", "HasPrerequisite", "ReceivesAction", "Desires",
|
|
"LocatedNear", "CreatedBy", "MadeOf", "HasProperty", "MotivatedByGoal",
|
|
"HasSubevent",
|
|
]
|
|
|
|
|
|
def main():
|
|
# Step 1: Read the word list from folksy_vocab.csv
|
|
words = []
|
|
with open(INPUT_PATH, "r", newline="") as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
words.append(row["word"].strip())
|
|
|
|
print(f"Read {len(words)} words from {INPUT_PATH}")
|
|
|
|
# Build node URIs
|
|
word_uris = [f"/c/en/{w}" for w in words]
|
|
|
|
# Build relation URIs
|
|
relation_uris = [f"/r/{r}" for r in RELATION_TYPES]
|
|
|
|
conn = psycopg2.connect(dbname="conceptnet5")
|
|
cur = conn.cursor()
|
|
|
|
# Step 2: Look up all node IDs for these words
|
|
cur.execute("SELECT id, uri FROM nodes WHERE uri = ANY(%s)", (word_uris,))
|
|
node_rows = cur.fetchall()
|
|
uri_to_id = {uri: nid for nid, uri in node_rows}
|
|
id_to_uri = {nid: uri for nid, uri in node_rows}
|
|
|
|
found_words = [uri.replace("/c/en/", "") for uri in uri_to_id]
|
|
missing_words = set(words) - set(found_words)
|
|
print(f"Found {len(uri_to_id)} node IDs out of {len(words)} words")
|
|
if missing_words:
|
|
print(f"Missing {len(missing_words)} words: {sorted(missing_words)[:20]}...")
|
|
|
|
node_ids = list(uri_to_id.values())
|
|
|
|
# Step 3: Look up relation IDs
|
|
cur.execute("SELECT id, uri FROM relations WHERE uri = ANY(%s)", (relation_uris,))
|
|
rel_rows = cur.fetchall()
|
|
rel_id_to_name = {rid: uri.replace("/r/", "") for rid, uri in rel_rows}
|
|
rel_ids = list(rel_id_to_name.keys())
|
|
|
|
print(f"Found {len(rel_ids)} relation types: {sorted(rel_id_to_name.values())}")
|
|
|
|
# Step 4: Query edges where both start and end are in our folksy node set,
|
|
# relation is one of our types, and weight >= 1.0
|
|
cur.execute(
|
|
"""
|
|
SELECT e.start_id, e.end_id, e.relation_id, e.weight, e.data->>'surfaceText'
|
|
FROM edges e
|
|
WHERE e.start_id = ANY(%s)
|
|
AND e.end_id = ANY(%s)
|
|
AND e.relation_id = ANY(%s)
|
|
AND e.weight >= 1.0
|
|
ORDER BY e.weight DESC
|
|
""",
|
|
(node_ids, node_ids, rel_ids),
|
|
)
|
|
|
|
rows = cur.fetchall()
|
|
print(f"Found {len(rows)} edges")
|
|
|
|
cur.close()
|
|
conn.close()
|
|
|
|
# Step 5: Convert node IDs back to word strings and write CSV
|
|
results = []
|
|
for start_id, end_id, relation_id, weight, surface_text in rows:
|
|
start_word = id_to_uri[start_id].replace("/c/en/", "")
|
|
end_word = id_to_uri[end_id].replace("/c/en/", "")
|
|
relation = rel_id_to_name[relation_id]
|
|
results.append((start_word, end_word, relation, weight, surface_text or ""))
|
|
|
|
# Step 6: Write output CSV sorted by weight descending (already sorted by query)
|
|
with open(OUTPUT_PATH, "w", newline="") as f:
|
|
writer = csv.writer(f, quoting=csv.QUOTE_ALL)
|
|
writer.writerow(["start_word", "end_word", "relation", "weight", "surface_text"])
|
|
for row in results:
|
|
writer.writerow(row)
|
|
|
|
print(f"Wrote {len(results)} relationships to {OUTPUT_PATH}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|