Commit 92c78930 authored by jlowryduda's avatar jlowryduda
Browse files

Merge branch 'master' into add-kyoto-yahoo-data

parents b610b058 028b1b39
from .connection import get_db_connection
from conceptnet5.db.connection import get_db_connection
from conceptnet5.edges import transform_for_linked_data
import json
import itertools
......@@ -13,6 +13,9 @@ RANDOM_NODES_QUERY = "SELECT * FROM nodes TABLESAMPLE SYSTEM(1) WHERE uri LIKE :
DATASET_QUERY = "SELECT uri, data FROM edges TABLESAMPLE SYSTEM(0.01) WHERE data->'dataset' = %(dataset)s ORDER BY weight DESC OFFSET %(offset)s LIMIT %(limit)s"
TOO_BIG_PREFIXES = ['/c/en', '/c/fr', '/c/es', '/c/de', '/c/ja', '/c/zh',
'/c/pt', '/c/la', '/c/it', '/c/ru' ,'/c/fi']
NODE_TO_FEATURE_QUERY = """
WITH node_ids AS (
SELECT p.node_id FROM nodes n, node_prefixes p
......@@ -31,24 +34,62 @@ MAX_GROUP_SIZE = 20
def make_list_query(criteria):
"""
Given a dictionary of criteria being searched for, construct the SQL
query for it.
This may require the union of two queries (one for outgoing edges and one
for incoming), and it may require saving certain query criteria for last
because we know they match too many things.
We don't substitute in the actual values of the criteria here -- that's
PostgreSQL's job. We just return a properly parameterized query.
"""
# Look up the given criteria in the cache, and use the cached query if available
crit_tuple = tuple(sorted(criteria))
if crit_tuple in LIST_QUERIES:
return LIST_QUERIES[crit_tuple]
# Start the inner query, finding edges that match the criteria
parts = ["WITH matched_edges AS ("]
if 'node' in criteria:
# If this is a 'node' query, it happens as the union of two parts: the outgoing
# direction (1) and the incoming direction (-1).
if 'node' in criteria or 'filter_node' in criteria:
piece_directions = [1, -1]
else:
piece_directions = [1]
# Construct the parts of the union
for direction in piece_directions:
# If this is the incoming part (the second one), put the "UNION ALL" keyword
# between the parts that we want to union
if direction == -1:
parts.append("UNION ALL")
parts.append("SELECT e.uri, e.weight, e.data")
# If we need to do some after-the-fact filtering, select the URIs of the
# things we're filtering on.
if 'filter_start' in criteria or 'filter_end' in criteria:
parts.append(", np1.uri as start_uri, np2.uri as end_uri")
# If the filter is 'filter_node' or 'filter_other', do the bookkeeping
# for each direction to remember which one was 'node' and which one was
# 'other'.
if 'filter_node' in criteria or 'filter_other' in criteria:
if direction == 1:
parts.append(", np1.uri as node_uri, np2.uri as other_uri")
else:
parts.append(", np2.uri as node_uri, np1.uri as other_uri")
# Name the tables that we need to join. We select from the nodes table
# as 'n1' and 'n2' to find the nodes that actually participate in the
# query, and 'np1' and 'np2' to get node IDs that we match against the
# 'node_prefixes' table.
parts.append("""
SELECT e.uri, e.weight, e.data
FROM relations r, edges e, nodes n1, nodes n2,
node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
""")
if 'source' in criteria:
parts.append(", edge_sources es, sources s")
parts.append("""
WHERE e.relation_id=r.id
AND e.start_id=n1.id
......@@ -58,42 +99,74 @@ def make_list_query(criteria):
AND p2.prefix_id=np2.id
AND p2.node_id=n2.id
""")
# Apply the criteria...
if 'source' in criteria:
parts.append("AND s.uri=%(source)s AND es.source_id=s.id AND es.edge_id=e.id")
if 'node' in criteria:
# But don't apply the criteria that we said to filter later
if 'node' in criteria and 'filter_node' not in criteria:
if direction == 1:
parts.append("AND np1.uri = %(node)s")
else:
parts.append("AND np2.uri = %(node)s")
if 'other' in criteria:
if 'other' in criteria and 'filter_other' not in criteria:
if direction == 1:
parts.append("AND np2.uri = %(other)s")
else:
parts.append("AND np1.uri = %(other)s")
if 'rel' in criteria:
parts.append("AND r.uri = %(rel)s")
if 'start' in criteria:
if 'start' in criteria and 'filter_start' not in criteria:
parts.append("AND np1.uri = %(start)s")
if 'end' in criteria:
if 'end' in criteria and 'filter_end' not in criteria:
parts.append("AND np2.uri = %(end)s")
# Put a reasonable limit on how many edges this inner query can match.
# This keeps a bound on the runtime but it means that you can't see more
# than 10000 results of a query in total.
parts.append("LIMIT 10000")
parts.append(")")
# That was the inner query. Now extract the information from it, remove
# duplicate results, and apply the filters we saved for later.
parts.append("SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges")
more_clauses = []
if 'filter_node' in criteria:
more_clauses.append('node_uri LIKE %(filter_node)s')
if 'filter_other' in criteria:
more_clauses.append('other_uri LIKE %(filter_other)s')
if 'filter_start' in criteria:
more_clauses.append('start_uri LIKE %(filter_start)s')
if 'filter_end' in criteria:
more_clauses.append('end_uri LIKE %(filter_end)s')
# We only have a WHERE clause if one of these filters applies
if more_clauses:
parts.append("WHERE " + " AND ".join(more_clauses))
# Sort the results by weight and apply the offset and limit
parts.append("""
SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
ORDER BY weight DESC, uri
OFFSET %(offset)s LIMIT %(limit)s
""")
# Put the parts together into one query string
query = '\n'.join(parts)
# Cache the query string
LIST_QUERIES[crit_tuple] = query
return query
class AssertionFinder(object):
    """
    The object that interacts with the database to find ConceptNet assertions
    (edges) matching certain criteria.
    """
    def __init__(self, dbname=None):
        # The database connection is opened lazily, on the first query that
        # needs it, so constructing an AssertionFinder does no I/O.
        self.connection = None
        # dbname: which database to connect to; None means the default
        # database chosen by get_db_connection.
        self.dbname = dbname
def lookup(self, uri, limit=100, offset=0):
"""
A query that returns all the edges that include a certain URI.
"""
if self.connection is None:
self.connection = get_db_connection(self.dbname)
if uri.startswith('/c/') or uri.startswith('http'):
......@@ -111,6 +184,13 @@ class AssertionFinder(object):
return self.query(criteria, limit, offset)
def lookup_grouped_by_feature(self, uri, limit=20):
"""
The query used by the browseable interface, which groups its results
by what 'feature' they describe of the queried node.
A feature is defined by the relation, the queried node, and the direction
(incoming or outgoing).
"""
uri = remove_control_chars(uri)
if self.connection is None:
self.connection = get_db_connection(self.dbname)
......@@ -140,6 +220,9 @@ class AssertionFinder(object):
return results
def lookup_assertion(self, uri):
"""
Get a single assertion, given its URI starting with /a/.
"""
# Sanitize URIs to remove control characters such as \x00. The postgres driver would
# remove \x00 anyway, but this avoids reporting a server error when that happens.
uri = remove_control_chars(uri)
......@@ -151,6 +234,9 @@ class AssertionFinder(object):
return results
def sample_dataset(self, uri, limit=50, offset=0):
"""
Get a subsample of edges matching a particular dataset.
"""
uri = remove_control_chars(uri)
if self.connection is None:
self.connection = get_db_connection(self.dbname)
......@@ -161,6 +247,9 @@ class AssertionFinder(object):
return results
def random_edges(self, limit=20):
"""
Get a collection of distinct, randomly-selected edges.
"""
if self.connection is None:
self.connection = get_db_connection(self.dbname)
cursor = self.connection.cursor()
......@@ -169,16 +258,27 @@ class AssertionFinder(object):
return results
def query(self, criteria, limit=20, offset=0):
    """
    The most general way to query based on a set of criteria.

    `criteria` maps criterion names (such as 'node', 'other', 'start',
    'end', 'rel', or 'source') to the URI values they must match. Returns
    up to `limit` matching edges, starting at `offset`, as dictionaries
    in linked-data form.
    """
    # Work on a copy, because we may add 'filter_*' entries below and the
    # caller's dict should not be mutated.
    criteria = criteria.copy()
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    # Criteria whose value is an overly-broad prefix (such as '/c/en')
    # would match too many rows up front; convert each one into a deferred
    # LIKE filter that make_list_query applies outside the inner query.
    for criterion in ['node', 'other', 'start', 'end']:
        if criterion in criteria and criteria[criterion] in TOO_BIG_PREFIXES:
            criteria['filter_' + criterion] = criteria[criterion] + '%'
    # Build the parameterized SQL once. (The scraped diff had this line and
    # the `results = ...` assignment duplicated; one copy of each suffices.)
    query_string = make_list_query(criteria)
    # Sanitize parameter values; actual substitution into the SQL is done
    # safely by the database driver, not by string formatting.
    params = {
        key: remove_control_chars(value)
        for (key, value) in criteria.items()
    }
    params['limit'] = limit
    params['offset'] = offset
    cursor = self.connection.cursor()
    cursor.execute(query_string, params)
    results = [
        transform_for_linked_data(data) for uri, data in cursor.fetchall()
    ]
    return results
import re
from nose.tools import eq_, assert_raises
from conceptnet5.db.query import make_list_query
# Matches any run of whitespace, so it can be collapsed to a single space.
EXTRA_WHITESPACE_RE = re.compile(r"\s+")
# Matches a whitespace character that is immediately followed by a non-word
# character (lookahead, so the symbol itself is not consumed).
SYMBOL_WHITESPACE_RE = re.compile(r"\s(?=\W)")
def strip_whitespace(query):
    """
    Convert SQL queries to a canonical form that disregards most whitespace
    differences.
    """
    # Any run of whitespace becomes a single space...
    collapsed = EXTRA_WHITESPACE_RE.sub(" ", query)
    # ...then a space sitting directly before punctuation is dropped...
    tightened = SYMBOL_WHITESPACE_RE.sub("", collapsed)
    # ...and finally the surrounding whitespace is trimmed away.
    return tightened.strip()
def whitespace_eq(string1, string2):
    """
    Assert that two strings are equal after whitespace canonicalization.
    """
    canonical1 = strip_whitespace(string1)
    canonical2 = strip_whitespace(string2)
    eq_(canonical1, canonical2)
def test_whitespace():
    """
    Test the test helper functions, so we know they're testing the right thing.
    """
    # Inputs paired with their expected canonical forms.
    cases = [
        ("\tline 1\n\tline 2\n", "line 1 line 2"),
        (" text , with spaces ", "text, with spaces"),
    ]
    for raw, canonical in cases:
        eq_(strip_whitespace(raw), canonical)
    # Equivalent modulo whitespace: the comma stays attached to the word.
    whitespace_eq(" text , with spaces ", "text,\nwith\nspaces")
    # Putting the comma at the start of a line changes the canonical form,
    # so this comparison must fail.
    with assert_raises(AssertionError):
        whitespace_eq(" text , with spaces ", "text\n,with\nspaces")
# The following tests try various different query shapes and make sure they
# return the expected query strings.
#
# The values of the query criteria don't matter. They're only substituted in
# when the query is actually sent to the database, using the proper substitution
# mechanism that avoids SQL injection.
def test_query_start():
    """A 'start' criterion constrains only the outgoing direction, so the
    inner query is a single SELECT with no UNION."""
    query = make_list_query({'start': 'x'})
    expected = """
    WITH matched_edges AS (
        SELECT e.uri, e.weight, e.data
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND np1.uri = %(start)s
        LIMIT 10000
    )
    SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
    ORDER BY weight DESC, uri
    OFFSET %(offset)s LIMIT %(limit)s
    """
    whitespace_eq(query, expected)
def test_query_source():
    """A 'source' criterion adds the edge_sources and sources tables to the
    join, plus the conditions tying them to the matched edge."""
    query = make_list_query({'source': 'x'})
    expected = """
    WITH matched_edges AS (
        SELECT e.uri, e.weight, e.data
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2,
             edge_sources es, sources s
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND s.uri=%(source)s AND es.source_id=s.id AND es.edge_id=e.id
        LIMIT 10000
    )
    SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
    ORDER BY weight DESC, uri
    OFFSET %(offset)s LIMIT %(limit)s
    """
    whitespace_eq(query, expected)
def test_query_end_source():
    """Multiple criteria combine inside one SELECT: the source join plus an
    'end' condition on np2."""
    query = make_list_query({'end': 'x', 'source': 'x'})
    expected = """
    WITH matched_edges AS (
        SELECT e.uri, e.weight, e.data
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2,
             edge_sources es, sources s
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND s.uri=%(source)s AND es.source_id=s.id AND es.edge_id=e.id
        AND np2.uri = %(end)s
        LIMIT 10000
    )
    SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
    ORDER BY weight DESC, uri
    OFFSET %(offset)s LIMIT %(limit)s
    """
    whitespace_eq(query, expected)
def test_query_node():
    """A 'node' query is the UNION ALL of the outgoing direction (node as
    np1/start) and the incoming direction (node as np2/end)."""
    query = make_list_query({'node': 'x'})
    expected = """
    WITH matched_edges AS (
        SELECT e.uri, e.weight, e.data
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND np1.uri = %(node)s
        UNION ALL
        SELECT e.uri, e.weight, e.data
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND np2.uri = %(node)s
        LIMIT 10000
    )
    SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
    ORDER BY weight DESC, uri
    OFFSET %(offset)s LIMIT %(limit)s
    """
    whitespace_eq(query, expected)
def test_query_filter_node():
    """A 'filter_node' criterion defers the node match: each half of the
    union selects node_uri/other_uri, and the LIKE filter is applied in the
    outer query, after the inner LIMIT."""
    query = make_list_query({'filter_node': 'x'})
    expected = """
    WITH matched_edges AS (
        SELECT e.uri, e.weight, e.data, np1.uri as node_uri, np2.uri as other_uri
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        UNION ALL
        SELECT e.uri, e.weight, e.data, np2.uri as node_uri, np1.uri as other_uri
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        LIMIT 10000
    )
    SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
    WHERE node_uri LIKE %(filter_node)s
    ORDER BY weight DESC, uri
    OFFSET %(offset)s LIMIT %(limit)s
    """
    whitespace_eq(query, expected)
def test_query_filter_node_rel():
    """The 'rel' criterion is applied inside both halves of the union, while
    'filter_node' is still deferred to the outer LIKE clause."""
    query = make_list_query({'filter_node': 'x', 'rel': 'x'})
    expected = """
    WITH matched_edges AS (
        SELECT e.uri, e.weight, e.data, np1.uri as node_uri, np2.uri as other_uri
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND r.uri = %(rel)s
        UNION ALL
        SELECT e.uri, e.weight, e.data, np2.uri as node_uri, np1.uri as other_uri
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND r.uri = %(rel)s
        LIMIT 10000
    )
    SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
    WHERE node_uri LIKE %(filter_node)s
    ORDER BY weight DESC, uri
    OFFSET %(offset)s LIMIT %(limit)s
    """
    whitespace_eq(query, expected)
def test_query_filter_start_source():
    """A 'filter_start' criterion selects start_uri/end_uri in the single
    outgoing-direction SELECT (no union) and applies the LIKE filter in the
    outer query; 'source' still joins its tables inside."""
    query = make_list_query({'filter_start': 'x', 'source': 'x'})
    expected = """
    WITH matched_edges AS (
        SELECT e.uri, e.weight, e.data, np1.uri as start_uri, np2.uri as end_uri
        FROM relations r, edges e, nodes n1, nodes n2,
             node_prefixes p1, node_prefixes p2, nodes np1, nodes np2,
             edge_sources es, sources s
        WHERE e.relation_id=r.id
        AND e.start_id=n1.id
        AND e.end_id=n2.id
        AND p1.prefix_id=np1.id
        AND p1.node_id=n1.id
        AND p2.prefix_id=np2.id
        AND p2.node_id=n2.id
        AND s.uri=%(source)s AND es.source_id=s.id AND es.edge_id=e.id
        LIMIT 10000
    )
    SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges
    WHERE start_uri LIKE %(filter_start)s
    ORDER BY weight DESC, uri
    OFFSET %(offset)s LIMIT %(limit)s
    """
    whitespace_eq(query, expected)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment