Commit 69176749 authored by Robyn Speer

Add comments / docstrings to db.query

parent 31bc7a01
@@ -33,28 +33,63 @@ MAX_GROUP_SIZE = 20
def make_list_query(criteria):
    """
    Given a dictionary of criteria being searched for, construct the SQL
    query for it.
    This may require the union of two queries (one for outgoing edges and one
    for incoming), and it may require saving certain query criteria for last
    because we know they match too many things.
    We don't substitute in the actual values of the criteria here -- that's
    PostgreSQL's job. We just return a properly parameterized query.
    """
    # Look up the given criteria in the cache, and use the cached query if available
    crit_tuple = tuple(sorted(criteria))
    if crit_tuple in LIST_QUERIES:
        return LIST_QUERIES[crit_tuple]
    # Start the inner query, finding edges that match the criteria
    parts = ["WITH matched_edges AS ("]
    # If this is a 'node' query, it happens as the union of two parts: the outgoing
    # direction (1) and the incoming direction (-1).
    if 'node' in criteria:
        piece_directions = [1, -1]
    else:
        piece_directions = [1]
    # Construct the parts of the union
    for direction in piece_directions:
        # If this is the incoming part (the second one), put the "UNION ALL" keyword
        # between the parts that we want to union
        if direction == -1:
            parts.append("UNION ALL")
        parts.append("SELECT e.uri, e.weight, e.data, np1.uri as starturi, np2.uri as enduri")
        if direction == 1:
            parts.append(", np1.uri as node, np2.uri as other")
        else:
            parts.append(", np2.uri as node, np1.uri as other")
        parts.append("SELECT e.uri, e.weight, e.data")
        # If we need to do some after-the-fact filtering, select the URIs of the
        # things we're filtering on.
        if 'filter_start' in criteria or 'filter_end' in criteria:
            parts.append(", np1.uri as start_uri, np2.uri as end_uri")
        # If the filter is 'filter_node' or 'filter_other', do the bookkeeping
        # for each direction to remember which one was 'node' and which one was
        # 'other'.
        if 'filter_node' in criteria or 'filter_other' in criteria:
            if direction == 1:
                parts.append(", np1.uri as node_uri, np2.uri as other_uri")
            else:
                parts.append(", np2.uri as node_uri, np1.uri as other_uri")
        # Name the tables that we need to join. We select from the nodes table
        # as 'n1' and 'n2' to find the nodes that actually participate in the
        # query, and 'np1' and 'np2' to get node IDs that we match against the
        # 'node_prefixes' table.
        parts.append("""
            FROM relations r, edges e, nodes n1, nodes n2,
                 node_prefixes p1, node_prefixes p2, nodes np1, nodes np2
        """)
        if 'source' in criteria:
            parts.append(", edge_sources es, sources s")
        #
        parts.append("""
            WHERE e.relation_id=r.id
            AND e.start_id=n1.id
@@ -64,8 +99,11 @@ def make_list_query(criteria):
            AND p2.prefix_id=np2.id
            AND p2.node_id=n2.id
        """)
        # Apply the criteria...
        if 'source' in criteria:
            parts.append("AND s.uri=%(source)s AND es.source_id=s.id AND es.edge_id=e.id")
        # But don't apply the criteria that we said to filter later
        if 'node' in criteria and 'filter_node' not in criteria:
            if direction == 1:
                parts.append("AND np1.uri = %(node)s")
@@ -82,35 +120,53 @@ def make_list_query(criteria):
            parts.append("AND np1.uri = %(start)s")
        if 'end' in criteria and 'filter_end' not in criteria:
            parts.append("AND np2.uri = %(end)s")
    # Put a reasonable limit on how many edges this inner query can match.
    # This keeps a bound on the runtime but it means that you can't see more
    # than 10000 results of a query in total.
    parts.append("LIMIT 10000")
    parts.append(")")
    # That was the inner query. Now extract the information from it, remove
    # duplicate results, and apply the filters we saved for later.
    parts.append("SELECT DISTINCT ON (weight, uri) uri, data FROM matched_edges")
    more_clauses = []
    if 'filter_node' in criteria:
        more_clauses.append('node LIKE %(filter_node)s')
        more_clauses.append('node_uri LIKE %(filter_node)s')
    if 'filter_other' in criteria:
        more_clauses.append('other LIKE %(filter_other)s')
        more_clauses.append('other_uri LIKE %(filter_other)s')
    if 'filter_start' in criteria:
        more_clauses.append('starturi LIKE %(filter_start)s')
        more_clauses.append('start_uri LIKE %(filter_start)s')
    if 'filter_end' in criteria:
        more_clauses.append('enduri LIKE %(filter_end)s')
        more_clauses.append('end_uri LIKE %(filter_end)s')
    # We only have a WHERE clause if one of these filters applies
    if more_clauses:
        parts.append("WHERE " + " AND ".join(more_clauses))
    # Sort the results by weight and apply the offset and limit
    parts.append("""
        ORDER BY weight DESC, uri
        OFFSET %(offset)s LIMIT %(limit)s
    """)
    # Put the parts together into one query string
    query = '\n'.join(parts)
    # Cache the query string
    LIST_QUERIES[crit_tuple] = query
    return query
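
# A minimal usage sketch (assumed example values; not part of this commit). The
# returned string contains %(name)s placeholders that the database driver fills
# in at execution time, for example:
#
#     query = make_list_query({'node': '/c/en/example'})
#     cursor.execute(query, {'node': '/c/en/example', 'limit': 20, 'offset': 0})
#
# Because the cache key is the sorted tuple of criteria keys, any later call
# with the same set of keys reuses this query string.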

class AssertionFinder(object):
    """
    The object that interacts with the database to find ConceptNet assertions
    (edges) matching certain criteria.
    """
    def __init__(self, dbname=None):
        self.connection = None
        self.dbname = dbname

    def lookup(self, uri, limit=100, offset=0):
        """
        A query that returns all the edges that include a certain URI.
        """
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
        if uri.startswith('/c/') or uri.startswith('http'):
@@ -128,6 +184,13 @@ class AssertionFinder(object):
        return self.query(criteria, limit, offset)

    def lookup_grouped_by_feature(self, uri, limit=20):
        """
        The query used by the browseable interface, which groups its results
        by what 'feature' they describe of the queried node.
        A feature is defined by the relation, the queried node, and the direction
        (incoming or outgoing).
        """
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
@@ -157,6 +220,9 @@ class AssertionFinder(object):
        return results

    def lookup_assertion(self, uri):
        """
        Get a single assertion, given its URI starting with /a/.
        """
        # Sanitize URIs to remove control characters such as \x00. The postgres driver would
        # remove \x00 anyway, but this avoids reporting a server error when that happens.
        uri = remove_control_chars(uri)
@@ -168,6 +234,9 @@ class AssertionFinder(object):
        return results

    def sample_dataset(self, uri, limit=50, offset=0):
        """
        Get a subsample of edges matching a particular dataset.
        """
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
@@ -178,6 +247,9 @@ class AssertionFinder(object):
        return results

    def random_edges(self, limit=20):
        """
        Get a collection of distinct, randomly-selected edges.
        """
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
        cursor = self.connection.cursor()
@@ -186,6 +258,9 @@ class AssertionFinder(object):
        return results

    def query(self, criteria, limit=20, offset=0):
        """
        The most general way to query based on a set of criteria.
        """
        criteria = criteria.copy()
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
@@ -202,7 +277,6 @@ class AssertionFinder(object):
        params['offset'] = offset
        cursor = self.connection.cursor()
        print(query_string, params)
        cursor.execute(query_string, params)
        results = [
            transform_for_linked_data(data) for uri, data in cursor.fetchall()
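A minimal usage sketch of the class above. The `conceptnet5.db.query` import path, the example URIs, and the default database are assumptions inferred from the commit message's "db.query", not part of this commit:

    from conceptnet5.db.query import AssertionFinder

    finder = AssertionFinder()                       # connects lazily on first use
    edges = finder.lookup('/c/en/example', limit=5)  # edges that mention a concept URI
    more = finder.query({'node': '/c/en/example', 'source': '/s/contributor/omcs'}, limit=5)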