Unverified Commit 2ec9fbff authored by jlowryduda's avatar jlowryduda Committed by GitHub
Browse files

Merge pull request #266 from commonsense/fix-malformed-related-query

Fix malformed /related query
parents 8f04c723 1fbb9730
......@@ -74,6 +74,12 @@ PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH
# We need an external vocabulary to refer to when miniaturizing our set of
# embeddings. In the normal case, this is the word2vec Google News vocabulary.
# In the test build, we don't have that; we only have a cut-down version of GloVe,
# so we'll switch it to that instead.
MINI_VOCAB_SOURCE = "/vectors/w2v-google-news.h5"
INPUT_EMBEDDINGS = [
'crawl-300d-2M', 'w2v-google-news', 'glove12-840B', 'fasttext-opensubtitles'
]
......@@ -96,6 +102,7 @@ if TESTMODE:
RAW_DATA_URL = "/missing/data"
PRECOMPUTED_DATA_URL = "/missing/data"
EMOJI_LANGUAGES = ['en', 'en_001']
MINI_VOCAB_SOURCE = "/vectors/glove12-840B.h5"
CORE_DATASET_NAMES = [
......@@ -161,6 +168,7 @@ rule test:
DATA + "/assertions/assertions.csv",
DATA + "/psql/done",
DATA + "/assoc/reduced.csv",
DATA + "/vectors/mini.h5",
DATA + "/vectors/plain/numberbatch-en.txt.gz",
......@@ -730,7 +738,7 @@ rule debias:
rule miniaturize:
input:
DATA + "/vectors/numberbatch-biased.h5",
DATA + "/vectors/w2v-google-news.h5"
DATA + MINI_VOCAB_SOURCE
output:
DATA + "/vectors/mini.h5"
resources:
......
......@@ -122,7 +122,13 @@ class VectorSpaceWrapper(object):
@staticmethod
def _englishify(term):
    """
    Change the language of a /c/ term to English.

    Return the English ('/c/en/...') version of the term's URI, or None
    if the input isn't a /c/ term or contains no concept text after the
    language code.
    """
    # Reject non-term input before parsing, so split_uri is never asked
    # to handle something that isn't a URI at all.
    if not term.startswith('/c/'):
        return None
    splits = split_uri(term)
    if len(splits) > 2:
        # splits[2] is the concept text; re-root it under English.
        return '/c/en/' + splits[2]
    # A bare '/c/<lang>' with no concept text: nothing to englishify.
    return None
......@@ -188,7 +194,8 @@ class VectorSpaceWrapper(object):
prefix_weight = 0.01
if get_uri_language(term) != 'en':
englishified = self._englishify(term)
expanded.append((englishified, prefix_weight))
if englishified is not None:
expanded.append((englishified, prefix_weight))
prefix_matches = self._match_prefix(term, prefix_weight)
expanded.extend(prefix_matches)
......
from conceptnet5 import api
from nose.tools import eq_
def test_related_query():
    # A well-formed concept URI should yield exactly as many related
    # terms as the limit we asked for.
    response = api.query_related('/c/en/test', limit=3)
    related_terms = response['related']
    eq_(len(related_terms), 3)
def test_related_query_malformed():
    # A nonsense URI is still answered without an error -- it simply
    # produces an empty list of related terms.
    response = api.query_related('/c/en,test', limit=3)
    related_terms = response['related']
    eq_(len(related_terms), 0)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment