Commit 2046b4ab authored by Robyn Speer

standardize formatting in several files

parent 0c1acadf
@@ -7,9 +7,7 @@ from conceptnet5.nodes import standardized_concept_uri, ld_node
VECTORS = VectorSpaceWrapper()
FINDER = VECTORS.finder
CONTEXT = [
"http://api.conceptnet.io/ld/conceptnet5.6/context.ld.json",
]
CONTEXT = ["http://api.conceptnet.io/ld/conceptnet5.6/context.ld.json"]
VALID_KEYS = ['rel', 'start', 'end', 'node', 'other', 'source', 'uri']
@@ -20,10 +18,7 @@ def success(response):
def error(response, status, details):
response['@context'] = CONTEXT
response['error'] = {
'status': status,
'details': details
}
response['error'] = {'status': status, 'details': details}
return response
@@ -61,8 +56,7 @@ def paginated_url(url, params, offset, limit):
replacing those parameters if they already existed.
"""
new_params = [
(key, val) for (key, val) in params
if key != 'offset' and key != 'limit'
(key, val) for (key, val) in params if key != 'offset' and key != 'limit'
] + [('offset', offset), ('limit', limit)]
return make_query_url(url, new_params)
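Since this hunk only reflows paginated_url, a standalone sketch of the behaviour its docstring describes may help: any existing 'offset'/'limit' parameters are dropped and the new ones appended. make_query_url is not shown in this diff, so urllib.parse.urlencode stands in for it here (an assumption, not the project's implementation).

from urllib.parse import urlencode

def paginated_url_sketch(url, params, offset, limit):
    # Drop any existing 'offset'/'limit' pairs, then append the new ones,
    # mirroring the comprehension reformatted above.
    new_params = [
        (key, val) for (key, val) in params if key != 'offset' and key != 'limit'
    ] + [('offset', offset), ('limit', limit)]
    return url + '?' + urlencode(new_params)

print(paginated_url_sketch('/c/en/dog', [('offset', 0), ('limit', 20)], 20, 20))
# /c/en/dog?offset=20&limit=20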
@@ -89,7 +83,9 @@ def make_paginated_view(url, params, offset, limit, more):
pager['previousPage'] = paginated_url(url, params, prev_offset, limit)
if more:
pager['nextPage'] = paginated_url(url, params, next_offset, limit)
pager['comment'] = "There are more results. Follow the 'nextPage' link for more."
pager[
'comment'
] = "There are more results. Follow the 'nextPage' link for more."
return pager
@@ -100,8 +96,7 @@ def lookup_grouped_by_feature(term, filters=None, feature_limit=10):
"""
if not term.startswith('/c/'):
return error(
{}, 400,
'Only concept nodes (starting with /c/) can be grouped by feature.'
{}, 400, 'Only concept nodes (starting with /c/) can be grouped by feature.'
)
found = FINDER.lookup_grouped_by_feature(term, limit=(feature_limit + 1))
@@ -117,10 +112,12 @@ def lookup_grouped_by_feature(term, filters=None, feature_limit=10):
'weight': sum(assertion['weight'] for assertion in assertions),
'feature': dict(feature_pairs),
'edges': assertions[:feature_limit],
'symmetric': symmetric
'symmetric': symmetric,
}
if len(assertions) > feature_limit:
view = make_paginated_view(base_url, feature_pairs, 0, feature_limit, more=True)
view = make_paginated_view(
base_url, feature_pairs, 0, feature_limit, more=True
)
group['view'] = view
grouped.append(group)
@@ -131,7 +128,9 @@ def lookup_grouped_by_feature(term, filters=None, feature_limit=10):
response = ld_node(term)
if not grouped and not filters:
return error(response, 404, '%r is not a node in ConceptNet.' % response['label'])
return error(
response, 404, '%r is not a node in ConceptNet.' % response['label']
)
else:
response['features'] = grouped
return success(response)
@@ -145,15 +144,10 @@ def lookup_paginated(term, limit=50, offset=0):
# Query one more edge than asked for, so we know if there are more
found = FINDER.lookup(term, limit=(limit + 1), offset=offset)
edges = found[:limit]
response = {
'@id': term,
'edges': edges
}
response = {'@id': term, 'edges': edges}
more = len(found) > len(edges)
if len(found) > len(edges) or offset != 0:
response['view'] = make_paginated_view(
term, (), offset, limit, more=more
)
response['view'] = make_paginated_view(term, (), offset, limit, more=more)
if not found:
return error(response, 404, '%r is not a node in ConceptNet.' % term)
else:
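The comment at the top of this hunk ("Query one more edge than asked for, so we know if there are more") is a small pagination idiom worth spelling out. A self-contained sketch, with a plain list standing in for FINDER.lookup:

def lookup_page(items, offset=0, limit=50):
    found = items[offset:offset + limit + 1]   # fetch one extra row
    edges = found[:limit]
    more = len(found) > len(edges)             # the extra row means another page exists
    return edges, more

edges, more = lookup_page(list(range(120)), offset=100, limit=50)
print(len(edges), more)   # 20 False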
@@ -167,9 +161,7 @@ def lookup_single_assertion(uri):
We return that edge if it exists, and if not, we return a 404 error.
"""
found = FINDER.lookup(uri, limit=1)
response = {
'@id': uri
}
response = {'@id': uri}
if not found:
return error(response, 404, '%r is not an assertion in ConceptNet.' % uri)
else:
@@ -188,16 +180,13 @@ def query_relatedness(node1, node2):
url = make_query_url('/relatedness', [('node1', node1), ('node2', node2)])
try:
relatedness = VECTORS.get_similarity(node1, node2)
response = {
'@id': url,
'value': round(float(relatedness), 3)
}
response = {'@id': url, 'value': round(float(relatedness), 3)}
return success(response)
except ValueError:
return error(
{'@id': url}, 400,
"Couldn't look up {} or {} (or both).".format(repr(node1),
repr(node2))
{'@id': url},
400,
"Couldn't look up {} or {} (or both).".format(repr(node1), repr(node2)),
)
@@ -223,14 +212,12 @@ def query_related(uri, filter=None, limit=20):
weight = 1.
query.append(('/c/{}/{}'.format(language, term), weight))
except ValueError:
return error(
{'@id': uri}, 400,
"Couldn't parse this term list: %r" % uri
)
return error({'@id': uri}, 400, "Couldn't parse this term list: %r" % uri)
else:
return error(
{'@id': uri}, 404,
'%r is not something that I can find related terms to.' % uri
{'@id': uri},
404,
'%r is not something that I can find related terms to.' % uri,
)
found = VECTORS.similar_terms(query, filter=filter, limit=limit)
@@ -238,10 +225,7 @@
{'@id': key, 'weight': round(float(weight), 3)}
for (key, weight) in found.items()
]
response = {
'@id': uri,
'related': related
}
response = {'@id': uri, 'related': related}
return response
@@ -254,10 +238,7 @@ def query_paginated(query, offset=0, limit=50):
"""
found = FINDER.query(query, limit=limit + 1, offset=offset)
edges = found[:limit]
response = {
'@id': make_query_url('/query', query.items()),
'edges': edges
}
response = {'@id': make_query_url('/query', query.items()), 'edges': edges}
more = len(found) > len(edges)
if len(found) > len(edges) or offset != 0:
response['view'] = make_paginated_view(
@@ -271,11 +252,11 @@ def standardize_uri(language, text):
Look up the URI for a given piece of text.
"""
if text is None or language is None:
return error({}, 400, "You should include the 'text' and 'language' parameters.")
return error(
{}, 400, "You should include the 'text' and 'language' parameters."
)
text = text.replace('_', ' ')
uri = standardized_concept_uri(language, text)
response = {
'@id': uri
}
response = {'@id': uri}
return success(response)
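For context, the endpoint reformatted here simply swaps underscores for spaces and delegates to standardized_concept_uri. A usage sketch (requires conceptnet5; the printed URI is the expected form under ConceptNet's usual scheme, shown as a comment rather than asserted):

from conceptnet5.nodes import standardized_concept_uri

text = 'Canary_Islands'.replace('_', ' ')
uri = standardized_concept_uri('en', text)
print(uri)   # expected: /c/en/canary_islands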
@@ -12,8 +12,14 @@ from wordfreq.preprocess import preprocess_text
from conceptnet5.language.english import english_filter
from conceptnet5.languages import LCODE_ALIASES
from conceptnet5.uri import concept_uri, get_uri_language, is_term, split_uri, uri_prefix, \
uri_to_label
from conceptnet5.uri import (
concept_uri,
get_uri_language,
is_term,
split_uri,
uri_prefix,
uri_to_label,
)
def preprocess_and_tokenize_text(lang, text):
@@ -71,7 +77,9 @@ def topic_to_concept(language, topic):
if not match:
return standardized_concept_uri(language, topic)
else:
return standardized_concept_uri(language, match.group(1), 'n', 'wp', match.group(2))
return standardized_concept_uri(
language, match.group(1), 'n', 'wp', match.group(2)
)
def standardized_concept_name(lang, text):
@@ -128,7 +136,7 @@ def standardized_concept_uri(lang, text, *more):
if token_filter is not None:
tokens = token_filter(tokens)
more_text.append('_'.join(tokens))
return concept_uri(lang, norm_text, *more_text)
@@ -156,7 +164,7 @@ def valid_concept_name(text):
False
"""
tokens = simple_tokenize(text.replace('_', ' '))
return (len(tokens) > 0)
return len(tokens) > 0
def ld_node(uri, label=None):
@@ -165,10 +173,7 @@ def ld_node(uri, label=None):
"""
if label is None:
label = uri_to_label(uri)
ld = {
'@id': uri,
'label': label
}
ld = {'@id': uri, 'label': label}
if is_term(uri):
pieces = split_uri(uri)
ld['language'] = get_uri_language(uri)
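A brief usage sketch of ld_node as reformatted above (requires conceptnet5; only the keys visible in this hunk are listed, and the real function may attach others):

from conceptnet5.nodes import ld_node

node = ld_node('/c/en/canary_islands')
print(node['@id'], node['label'], node['language'])
# expected: /c/en/canary_islands  canary islands  en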
@@ -73,7 +73,7 @@ SYMMETRIC_RELATIONS = {
'/r/EtymologicallyRelatedTo',
'/r/Synonym',
'/r/Antonym',
'/r/DistinctFrom'
'/r/DistinctFrom',
}
@@ -81,14 +81,16 @@ SYMMETRIC_RELATIONS = {
# having one relation implies that they don't have the opposite relation.
# You could consider these relations themselves to be related by the
# /r/Antonym relation.
OPPOSITE_RELATIONS = _make_symmetric_dict({
'/r/NotDesires': '/r/Desires',
'/r/NotUsedFor': '/r/UsedFor',
'/r/NotCapableOf': '/r/CapableOf',
'/r/NotHasProperty': '/r/HasProperty',
'/r/Antonym': '/r/Synonym',
'/r/ObstructedBy': '/r/HasPrerequisite',
})
OPPOSITE_RELATIONS = _make_symmetric_dict(
{
'/r/NotDesires': '/r/Desires',
'/r/NotUsedFor': '/r/UsedFor',
'/r/NotCapableOf': '/r/CapableOf',
'/r/NotHasProperty': '/r/HasProperty',
'/r/Antonym': '/r/Synonym',
'/r/ObstructedBy': '/r/HasPrerequisite',
}
)
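_make_symmetric_dict itself is not part of this diff; presumably it mirrors each pair so that either relation looks up its opposite. A hypothetical sketch of such a helper:

def make_symmetric_dict_sketch(d):
    result = dict(d)
    for key, value in d.items():
        result[value] = key    # add the reverse mapping too
    return result

opposites = make_symmetric_dict_sketch({'/r/Antonym': '/r/Synonym'})
print(opposites['/r/Synonym'])   # /r/Antonym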
# Most relations can be generalized into less specific relations. They don't
@@ -103,14 +105,13 @@ OPPOSITE_RELATIONS = _make_symmetric_dict({
# be used either in querying or in learning about relations.
ENTAILED_RELATIONS = {
'/r/Antonym': '/r/DistinctFrom',
'/r/Causes': '/r/RelatedTo',
'/r/CausesDesire': '/r/RelatedTo',
'/r/CapableOf': '/r/RelatedTo',
'/r/CreatedBy': '/r/RelatedTo',
'/r/DerivedFrom': '/r/RelatedTo',
'/r/EtymologicallyRelatedTo': '/r/RelatedTo',
'/r/Entails': '/r/RelatedTo', # can we connect entailment and sub-events?
'/r/Entails': '/r/RelatedTo', # can we connect entailment and sub-events?
'/r/HasContext': '/r/RelatedTo',
'/r/HasProperty': '/r/RelatedTo',
'/r/HasSubevent': '/r/RelatedTo',
@@ -124,25 +125,17 @@ ENTAILED_RELATIONS = {
'/r/SymbolOf': '/r/RelatedTo',
'/r/UsedFor': '/r/RelatedTo',
'/r/dbpedia': '/r/RelatedTo',
'/r/FormOf': '/r/DerivedFrom',
'/r/HasFirstSubevent': '/r/HasSubevent',
'/r/HasLastSubevent': '/r/HasSubevent',
'/r/HasPrerequisite': '/r/HasSubevent',
'/r/MannerOf': '/r/Entails',
'/r/DefinedAs': '/r/IsA',
'/r/InstanceOf': '/r/IsA',
'/r/AtLocation': '/r/LocatedNear',
'/r/HasA': '/r/LocatedNear',
'/r/PartOf': '/r/AtLocation',
'/r/MadeOf': '/r/HasA',
'/r/Synonym': '/r/SimilarTo',
}
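One way to read ENTAILED_RELATIONS, as the surrounding comment suggests, is to follow it transitively until a relation has no more general form (usually /r/RelatedTo). An illustrative, self-contained sketch using a small subset of the table above:

def entailed_chain(rel, entailed):
    chain = [rel]
    while chain[-1] in entailed:
        chain.append(entailed[chain[-1]])
    return chain

ENTAILED_SUBSET = {
    '/r/HasFirstSubevent': '/r/HasSubevent',
    '/r/HasSubevent': '/r/RelatedTo',
}
print(entailed_chain('/r/HasFirstSubevent', ENTAILED_SUBSET))
# ['/r/HasFirstSubevent', '/r/HasSubevent', '/r/RelatedTo']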
@@ -21,16 +21,16 @@ def standardize_text(text, lowercase=True):
def join_uri(*pieces):
"""
`join_uri` builds a URI from constituent pieces that should be joined
with slashes (/).
`join_uri` builds a URI from constituent pieces that should be joined with
slashes (/).
Leading and trailing on the pieces are acceptable, but will be ignored.
The resulting URI will always begin with a slash and have its pieces
separated by a single slash.
Leading and trailing slashes on the pieces are acceptable, but will be ignored. The
resulting URI will always begin with a slash and have its pieces separated
by a single slash.
The pieces do not have `preprocess_and_tokenize_text` applied to them; to make sure your
URIs are in normal form, run `preprocess_and_tokenize_text` on each piece that represents
arbitrary text.
The pieces do not have `preprocess_and_tokenize_text` applied to them; to
make sure your URIs are in normal form, run `preprocess_and_tokenize_text`
on each piece that represents arbitrary text.
>>> join_uri('/c', 'en', 'cat')
'/c/en/cat'
@@ -95,8 +95,7 @@ def concept_uri(lang, text, *more):
# probably junk
more = []
for dis1 in more[1:]:
assert ' ' not in dis1,\
"%r is not in normalized form" % dis1
assert ' ' not in dis1, "%r is not in normalized form" % dis1
return join_uri('/c', lang, text, *more)
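From the return statement above, disambiguation pieces are appended directly onto the /c/<lang>/<text> URI, which is why they must already be in normalized, space-free form. A quick check (requires conceptnet5):

from conceptnet5.uri import concept_uri

print(concept_uri('en', 'cat', 'n', 'animal'))   # /c/en/cat/n/animal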
@@ -223,8 +222,9 @@ def parse_compound_uri(uri):
if pieces[-1] != ']':
raise ValueError("Compound URIs must end with /]")
if '[' not in pieces:
raise ValueError("Compound URIs must contain /[/ at the beginning of "
"the argument list")
raise ValueError(
"Compound URIs must contain /[/ at the beginning of " "the argument list"
)
list_start = pieces.index('[')
op = join_uri(*pieces[:list_start])
@@ -233,7 +233,7 @@ def parse_compound_uri(uri):
depth = 0
# Split on commas, but not if they're within additional pairs of brackets.
for piece in pieces[(list_start + 1):-1]:
for piece in pieces[(list_start + 1) : -1]:
if piece == ',' and depth == 0:
chunks.append('/' + ('/'.join(current)).strip('/'))
current = []
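The comment in this hunk ("Split on commas, but not if they're within additional pairs of brackets") is the core of compound-URI parsing. A standalone sketch of that top-level split over the /-separated pieces, not the library's parse_compound_uri itself:

def split_toplevel(pieces):
    chunks, current, depth = [], [], 0
    for piece in pieces:
        if piece == ',' and depth == 0:
            chunks.append('/' + '/'.join(current).strip('/'))
            current = []
        else:
            current.append(piece)
            if piece == '[':
                depth += 1
            elif piece == ']':
                depth -= 1
    chunks.append('/' + '/'.join(current).strip('/'))
    return chunks

print(split_toplevel(['s', 'one', ',', 's', 'two']))
# ['/s/one', '/s/two']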
@@ -344,8 +344,10 @@ def is_term(uri):
def is_absolute_url(uri):
"""
We have URLs pointing to Creative Commons licenses, starting with 'cc:', which for Linked
Data purposes are absolute URLs because they'll be resolved into full URLs.
We have URLs pointing to Creative Commons licenses, starting with 'cc:',
which for Linked Data purposes are absolute URLs because they'll be resolved
into full URLs.
>>> is_absolute_url('http://fr.wiktionary.org/wiki/mįkká’e_uxpáðe')
True
>>> is_absolute_url('/c/fr/nouveau')
@@ -358,6 +360,7 @@ def get_uri_language(uri):
"""
Extract the language from a concept URI. If the URI points to an assertion,
get the language of its first concept.
>>> get_uri_language('/a/[/r/RelatedTo/,/c/en/orchestra/,/c/en/symphony/]')
'en'
>>> get_uri_language('/c/pl/cześć')
@@ -375,9 +378,10 @@ def uri_to_label(uri):
def uri_to_label(uri):
"""
Convert a ConceptNet uri into a label to be used in nodes. This
function replaces an underscore with a space, so while '/c/en/example' will be converted into
'example', '/c/en/canary_islands' will be converted into 'canary islands'.
Convert a ConceptNet uri into a label to be used in nodes. This function
replaces an underscore with a space, so while '/c/en/example' will be
converted into 'example', '/c/en/canary_islands' will be converted into
'canary islands'.
>>> uri_to_label('/c/en/example')
'example'
@@ -3,21 +3,24 @@ from os import path
from .debias import de_bias_frame
from .evaluation import wordsim, analogy, bias
from .evaluation.compare import (
compare_embeddings, graph_comparison
)
from .evaluation.compare import compare_embeddings, graph_comparison
from .formats import (
convert_glove, convert_word2vec, convert_fasttext, convert_polyglot,
load_hdf, save_hdf, export_text, save_labels, save_npy
convert_glove,
convert_word2vec,
convert_fasttext,
convert_polyglot,
load_hdf,
save_hdf,
export_text,
save_labels,
save_npy,
)
from .merge import merge_intersect
from .miniaturize import miniaturize
from .propagate import sharded_propagate
from .query import VectorSpaceWrapper
from .retrofit import sharded_retrofit, join_shards
from .transforms import (
make_big_frame, make_small_frame
)
from .transforms import make_big_frame, make_small_frame
ANALOGY_FILENAME = 'data/raw/analogy/SAT-package-V3.txt'
@@ -52,16 +55,28 @@ def filter_word_vectors(dense_hdf_filename, vocab_filename):
@click.option('--verbose', '-v', count=True)
@click.option('--max_cleanup_iters', '-m', default=20)
@click.option('--orig_vec_weight', '-w', default=0.15)
def run_retrofit(dense_hdf_filename, conceptnet_filename, output_filename,
iterations=5, nshards=6, verbose=0, max_cleanup_iters=20,
orig_vec_weight=0.15):
def run_retrofit(
dense_hdf_filename,
conceptnet_filename,
output_filename,
iterations=5,
nshards=6,
verbose=0,
max_cleanup_iters=20,
orig_vec_weight=0.15,
):
"""
Run retrofit, operating on a part of a frame at a time.
"""
sharded_retrofit(
dense_hdf_filename, conceptnet_filename, output_filename,
iterations=iterations, nshards=nshards, verbosity=verbose,
max_cleanup_iters=max_cleanup_iters, orig_vec_weight=orig_vec_weight
dense_hdf_filename,
conceptnet_filename,
output_filename,
iterations=iterations,
nshards=nshards,
verbosity=verbose,
max_cleanup_iters=max_cleanup_iters,
orig_vec_weight=orig_vec_weight,
)
@@ -78,7 +93,9 @@ def run_convert_glove(glove_filename, output_filename, nrows=500000):
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nrows', '-n', default=500000)
@click.option('--language', '-l', default='en')
def run_convert_fasttext(fasttext_filename, output_filename, nrows=500000, language='en'):
def run_convert_fasttext(
fasttext_filename, output_filename, nrows=500000, language='en'
):
convert_fasttext(fasttext_filename, output_filename, nrows=nrows, language=language)
@@ -99,7 +116,9 @@ def run_convert_polyglot(polyglot_filename, output_filename, language):
@cli.command(name='intersect')
@click.argument('input_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False))
@click.argument(
'input_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False)
)
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.argument('projection_filename', type=click.Path(writable=True, dir_okay=False))
def run_intersect(input_filenames, output_filename, projection_filename):
@@ -125,7 +144,9 @@ def run_debias(input_filename, output_filename):
@cli.command(name='evaluate')
@click.argument('filename', type=click.Path(readable=True, dir_okay=False))
@click.option('--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev')
@click.option(
'--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev'
)
@click.option('--semeval-by-language/--semeval-global', '-l', default=False)
@click.option('--run-analogies', is_flag=True)
def run_evaluate(filename, subset, semeval_by_language, run_analogies):
@@ -145,7 +166,9 @@ def run_evaluate(filename, subset, semeval_by_language, run_analogies):
@cli.command(name='evaluate_wordsim')
@click.argument('filename', type=click.Path(readable=True, dir_okay=False))
@click.option('--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev')
@click.option(
'--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev'
)
@click.option('--semeval-by-language/--semeval-global', '-l', default=False)
def run_evaluate_wordsim(filename, subset, semeval_by_language):
"""
@@ -161,7 +184,9 @@ def run_evaluate_wordsim(filename, subset, semeval_by_language):
@cli.command(name='evaluate_raw')
@click.argument('filename', type=click.Path(readable=True, dir_okay=False))
@click.option('--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev')
@click.option(
'--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev'
)
@click.option('--semeval-by-language/--semeval-global', '-l', default=False)
def run_evaluate_raw(filename, subset, semeval_by_language):
"""
@@ -177,7 +202,9 @@ def run_evaluate_raw(filename, subset, semeval_by_language):
@cli.command(name='evaluate_analogies')
@click.argument('filename', type=click.Path(readable=True, dir_okay=False))
@click.option('--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev')
@click.option(
'--subset', '-s', type=click.Choice(['dev', 'test', 'all']), default='dev'
)
def run_evaluate_analogies(filename, subset):
"""
Evaluate a frame on analogy datasets: SAT, Google analogies, Semeval2012-Task2.
@@ -197,7 +224,9 @@ def run_evaluate_bias(filename):
@cli.command(name='compare_embeddings')
@click.argument('input_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False))
@click.argument(
'input_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False)
)
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--run-analogies', is_flag=True)
def run_compare_embeddings(input_filenames, output_filename, run_analogies):
@@ -212,7 +241,9 @@ def run_compare_embeddings(input_filenames, output_filename, run_analogies):
neighbors in the ConceptNet graph. These embeddings could have been stored
in the matrix, but this saves memory and download time.
"""
results = compare_embeddings(input_filenames, subset='all', run_analogies=run_analogies)
results = compare_embeddings(
input_filenames, subset='all', run_analogies=run_analogies
)
print(results)
save_hdf(results, output_filename)
@@ -277,18 +308,21 @@ def export_background(input_filename, output_dir, concepts_filename, language):
@cli.command(name='propagate')
@click.argument('assoc_filename',
type=click.Path(readable=True, dir_okay=False))
@click.argument('embedding_filename',
type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename',
type=click.Path(writable=True, dir_okay=False))
@click.argument('assoc_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('embedding_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nshards', '-n', default=6)
@click.option('--iterations', default=20)
def run_propagate(assoc_filename, embedding_filename, output_filename,
nshards=6, iterations=20):
sharded_propagate(assoc_filename, embedding_filename, output_filename,
nshards=nshards, iterations=iterations)
def run_propagate(
assoc_filename, embedding_filename, output_filename, nshards=6, iterations=20
):
sharded_propagate(
assoc_filename,
embedding_filename,
output_filename,
nshards=nshards,
iterations=iterations,
)
@cli.command(name='join_shard_files')
@@ -8,9 +8,11 @@ from conceptnet5.db.query import AssertionFinder
from conceptnet5.uri import get_uri_language, split_uri, uri_prefix
from conceptnet5.util import get_data_filename
from conceptnet5.vectors import (
similar_to_vec, weighted_average, normalize_vec, cosine_similarity,
standardized_uri
similar_to_vec,
weighted_average,
normalize_vec,
cosine_similarity,
standardized_uri,
)
from conceptnet5.vectors.formats import load_hdf
from conceptnet5.vectors.transforms import l2_normalize_rows
@@ -27,8 +29,10 @@ def field_match(value, query):
"""
Determines whether a given field of an edge (or, in particular, an
assertion) matches the given query.
If the query is a URI, it will match prefixes of longer URIs, unless
`/.` is added to the end of the query.
If the query is a URI, it will match prefixes of longer URIs, unless `/.` is
added to the end of the query.
For example, `/c/en/dog` will match assertions about `/c/en/dog/n/animal`,
but `/c/en/dog/.` will only match assertions about `/c/en/dog`.
"""
@@ -38,8 +42,9 @@ def field_match(value, query):
elif query.endswith('/.'):
return value == query[:-2]
else:
return (value[:len(query)] == query
and (len(value) == len(query) or value[len(query)] == '/'))
return value[: len(query)] == query and (
len(value) == len(query) or value[len(query)] == '/'
)
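To make the prefix rule in field_match's docstring concrete, here is a standalone check using the same comparison as the reformatted return statement above (a hypothetical helper, not the module's function):

def uri_prefix_match(value, query):
    if query.endswith('/.'):
        return value == query[:-2]          # '/.' forces an exact match
    return value[:len(query)] == query and (
        len(value) == len(query) or value[len(query)] == '/'
    )

print(uri_prefix_match('/c/en/dog/n/animal', '/c/en/dog'))    # True
print(uri_prefix_match('/c/en/dogsled', '/c/en/dog'))         # False: no '/' boundary
print(uri_prefix_match('/c/en/dog/n/animal', '/c/en/dog/.'))  # False: '/.' means exact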