Commit 593de3a4 authored by Robyn Speer's avatar Robyn Speer
Browse files

Merge branch 'data-updates-5.7' into version5.7

parents fe3fbf99 65f0c03d
......@@ -24,14 +24,10 @@ UPLOAD = False
# from Morfessor.
USE_MORPHOLOGY = False
# How many pieces to split edge files into. (Works best when it's a power of
# 2 that's 64 or less.)
N_PIECES = 16
# The versions of Wiktionary data to download. Updating these requires
# uploading new Wiktionary dumps to ConceptNet's S3.
WIKTIONARY_VERSIONS = {
'en': '20171201',
'en': '20190101',
'fr': '20160305',
'de': '20160407'
}
......@@ -43,15 +39,19 @@ ATOMIC_SPACE_LANGUAGES = {'vi'}
# Languages that the CLDR emoji data is available in. These match the original
# filenames, not ConceptNet language codes; they are turned into ConceptNet
# language codes by the reader.
#
# This list is the list of languages with emoji names in CLDR v34, but
# skipping the en_GB file, which is empty and causes an error.
EMOJI_LANGUAGES = [
'af', 'am', 'ar', 'as', 'ast', 'az', 'be', 'bg', 'bn', 'bs', 'ca', 'chr', 'cs', 'cy', 'da',
'de', 'de_CH', 'el', 'en', 'en_001', 'en_AU', 'en_CA', 'en_GB', 'es', 'es_419', 'es_MX',
'es_US', 'et', 'eu', 'fa', 'fi', 'fil', 'fo', 'fr', 'fr_CA', 'ga', 'gd', 'gl', 'gu', 'he',
'hi', 'hr', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'ka', 'kab', 'kk', 'km', 'kn', 'ko', 'ky',
'lo', 'lt', 'lv', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'nb', 'ne', 'nl', 'nn', 'or', 'pa',
'pl', 'ps', 'pt', 'pt_PT', 'ro', 'ru', 'sd', 'si', 'sk', 'sl', 'sq', 'sr', 'sr_Latn', 'sv',
'sw', 'ta', 'te', 'th', 'tk', 'to', 'tr', 'uk', 'ur', 'uz', 'vi', 'yue', 'yue_Hans', 'zh',
'zh_Hant', 'zh_Hant_HK', 'zu'
'af', 'am', 'ar', 'ar_SA', 'as', 'ast', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'ccp',
'chr', 'cs', 'cy', 'da', 'de', 'de_CH', 'el', 'en', 'en_001', 'en_AU', 'en_CA',
'es', 'es_419', 'es_MX', 'es_US', 'et', 'eu', 'fa', 'fi', 'fil', 'fo', 'fr', 'fr_CA', 'ga',
'gd', 'gl', 'gu', 'he', 'hi', 'hr', 'hu', 'hy', 'ia', 'id', 'is', 'it', 'ja', 'ka', 'kab',
'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'lo', 'lt', 'lv', 'mk', 'ml', 'mn', 'mr', 'ms', 'my',
'nb', 'ne', 'nl', 'nn', 'or', 'pa', 'pl', 'ps', 'pt', 'pt_PT', 'ro', 'ru', 'sd', 'si', 'sk',
'sl', 'sq', 'sr', 'sr_Cyrl', 'sr_Cyrl_BA', 'sr_Latn', 'sr_Latn_BA', 'sv', 'sw', 'ta', 'te',
'th', 'tk', 'to', 'tr', 'uk', 'ur', 'uz', 'vi', 'yue', 'yue_Hans', 'zh', 'zh_Hant',
'zh_Hant_HK', 'zu'
]
# Increment this number when we incompatibly change the parser
......@@ -70,7 +70,7 @@ PROPAGATE_SHARDS = 6
# that will mainly be used to find more information about those terms.
RAW_DATA_URL = "https://zenodo.org/record/1165009/files/conceptnet-raw-data-5.6.zip"
RAW_DATA_URL = "https://zenodo.org/record/1165009/files/conceptnet-raw-data-5.7.zip"
PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH
......@@ -171,20 +171,20 @@ rule test:
# ===========
rule download_raw_package:
output:
DATA + "/raw/conceptnet-raw-data-5.6.zip"
DATA + "/raw/conceptnet-raw-data-5.7.zip"
shell:
"wget -nv {RAW_DATA_URL} -O {output}"
# Get emoji data directly from Unicode CLDR
rule download_unicode_data:
output:
DATA + "/raw/cldr-common-32.0.1.zip"
DATA + "/raw/cldr-common-34.0.zip"
shell:
"wget -nv http://unicode.org/Public/cldr/32.0.1/cldr-common-32.0.1.zip -O {output}"
"wget -nv http://unicode.org/Public/cldr/34/cldr-common-34.0.zip -O {output}"
rule extract_raw:
input:
DATA + "/raw/conceptnet-raw-data-5.6.zip"
DATA + "/raw/conceptnet-raw-data-5.7.zip"
output:
DATA + "/raw/{dirname}/{filename}"
shell:
......@@ -192,9 +192,11 @@ rule extract_raw:
# This rule takes precedence over extract_raw, extracting the emoji data from
# the Unicode CLDR zip file.
#
# TODO: integrate this with the rest of the raw data
rule extract_emoji_data:
input:
DATA + "/raw/cldr-common-32.0.1.zip"
DATA + "/raw/cldr-common-34.0.zip"
output:
DATA + "/raw/emoji/{filename}"
shell:
......
__version__ = '5.6.2'
__version__ = '5.7.0'
......@@ -11,11 +11,17 @@ from conceptnet5.languages import ALL_LANGUAGES
from conceptnet5.readers.wiktionary import valid_language
from conceptnet5.uri import conjunction_uri,get_uri_language, is_absolute_url, Licenses, \
split_uri, uri_prefix
from conceptnet5.util import get_support_data_filename
N = 100
CURRENT_DIR = os.getcwd()
def get_blacklist(filename=None):
    """
    Load the set of blacklisted strings from a support data file.

    The caller in this module compares the string values of each assertion
    against this set with an exact `in` test, so every entry has to be stored
    without its line terminator. Keeping the raw output of `readlines()` left
    a trailing newline on each entry, which prevents any exact match.

    Blank lines and lines starting with '#' are skipped. The blank-line check
    matters: once lines are stripped, a blank line would become the empty
    string, and an empty string in the set would reject any assertion that
    has an empty string value.

    Parameters:
        filename: optional path of the blacklist file. When omitted, the
            path is looked up with get_support_data_filename('blacklist.txt').

    Returns:
        A set of str, one per non-empty, non-comment line of the file.
    """
    if filename is None:
        filename = get_support_data_filename('blacklist.txt')

    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 to match how this module reads its other inputs.
    with open(filename, encoding='utf-8') as stream:
        entries = (line.strip() for line in stream)
        return {
            entry for entry in entries
            if entry and not entry.startswith('#')
        }
def weight_scale(weight):
"""
This scale starts out linear, then switches to a square-root scale at x=2.
......@@ -119,15 +125,19 @@ def combine_assertions(input_filename, output_filename):
out = MsgpackStreamWriter(output_filename)
out_bad = MsgpackStreamWriter(output_filename + '.reject')
blacklist = get_blacklist()
with open(input_filename, encoding='utf-8') as stream:
for key, line_group in itertools.groupby(stream, group_func):
assertion = make_assertion(line_group)
destination = out
if assertion is None:
continue
if assertion['weight'] > 0:
destination = out
else:
if assertion['weight'] <= 0:
destination = out_bad
for value in assertion.values():
if isinstance(value, str) and value in blacklist:
destination = out_bad
destination.write(assertion)
out.close()
......
......@@ -27,10 +27,15 @@ def handle_file(input_file, output_file):
out = MsgpackStreamWriter(output_file)
root = tree.getroot()
lang = root[0][1].attrib['type'] # language is at position [1] within the child node [0]
for annotation in root[1]:
for word in strip_words(annotation.text):
start = standardized_concept_uri('mul', annotation.attrib['cp'])
end = standardized_concept_uri(lang, word)
edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
out.write(edge)
if len(root) >= 2:
for annotation in root[1]:
for word in strip_words(annotation.text):
start = standardized_concept_uri('mul', annotation.attrib['cp'])
end = standardized_concept_uri(lang, word)
edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
out.write(edge)
else:
print("No emoji data in {!r}".format(input_file))
out.close()
# We should be able to address particular bad assertions by removing them.
# Here's one to start with.
/a/[/r/HasSubevent/,/c/en/get_drunk/,/c/en/drive_carefully/]
......@@ -5,7 +5,7 @@ from setuptools.command.develop import develop
import sys
packages = find_packages()
version_str = '5.6.4'
version_str = '5.7.0'
if sys.version_info.major < 3:
print("The ConceptNet 5 code can only run in Python 3.")
......
{
"comment": "Hey, you've found the JSON-LD context for ConceptNet. This file defines everything that appears in ConceptNet API responses, mostly for the benefit of software that understands JSON-LD, but it may be reasonably human-readable too. See http://www.conceptnet.io for more information about ConceptNet, http://api.conceptnet.io/docs for the API documentation, or http://json-ld.org/ for an introduction to JSON-LD.",
"definitions": [
{
"comment": "This section defines the types and properties used in the ConceptNet API in terms of other things in RDF. A JSON-LD processor won't actually use this section; it only cares about the more specific things defined below in the '@context' section. But I hope it's a good formal description of what's going on in the ConceptNet API, and if you interpret _this_ part with JSON-LD, you'll get out a bunch of RDF facts that could be useful if there's some sort of big Semantic Web revival."
},
{
"@id": "#Node",
"@type": "rdfs:Datatype",
"subClassOf": "#Query",
"comment": "A node in ConceptNet typically represents a word or phrase of natural language. A node can be tagged with a word sense to narrow down its meaning, or it can be left ambiguous and represent all meanings of the word or phrase."
},
{
"@id": "#Relation",
"@type": "rdfs:Datatype",
"subClassOf": "#Query",
"comment": "One of a fixed vocabulary of relations, indicating how two nodes are related. Examples include '/r/UsedFor' and '/r/Synonym'."
},
{
"@id": "#Edge",
"@type": "rdfs:Datatype",
"subClassOf": "rdf:Statement",
"comment": "Each edge in ConceptNet represents a fact of general knowledge. The edge can also be interpreted as an RDF statement, with a subject, predicate, and object."
},
{
"@id": "#Feature",
"@type": "rdfs:Datatype",
"subClassOf": "rdf:Resource",
"comment": "A Feature is a pattern that edges can match, specifying the relation and _one_ node. That node can be the 'start', 'end', or simply the 'node' of a symmetric relation."
},
{
"@id": "#Query",
"@type": "rdfs:Datatype",
"subClassOf": "rdf:Resource",
"comment": "A Query is a set of results that you can look up in the API. Each Node represents a Query for what edges are connected to that node, but other queries are possible, such as all edges with a particular start node and relation."
},
{
"@id": "#Source",
"@type": "rdfs:Datatype",
"subClassOf": "rdf:Resource",
"comment": "A Source is a reason to believe an Edge. It helps us track the provenance of where the edge came from, and judge whether it should be considered reliable."
},
{
"@id": "#RelatedNode",
"@type": "rdfs:Datatype",
"comment": "A node that is related to a query. Contains the '@id' of the related node, and the 'weight' for how related it is."
},
{
"@id": "#edges",
"@type": "rdf:Property",
"domain": "#Query",
"range": "#Edge",
"comment": "When you look up a node, its 'edges' property is a list of (some of) its incoming and outgoing edges. NOTE: Edge lists are paginated! By default you only get 20 edges, so you should follow the links in the 'pages:view' to get more."
},
{
"@id": "#rel",
"@type": "rdf:Property",
"subPropertyOf": "rdf:predicate",
"domain": ["#Edge", "#Feature"],
"range": "#Relation",
"comment": "Links to the kind of relationship that holds between two terms. In this API, the 'rel' will always be a ConceptNet URI beginning with /r/. In RDF, this would be called the 'predicate'."
},
{
"@id": "#start",
"@type": "rdf:Property",
"subPropertyOf": "rdf:subject",
"domain": ["#Edge", "#Feature"],
"range": "#Node",
"comment": "Links to the node that this edge points from. In RDF, this would be called the 'subject'."
},
{
"@id": "#end",
"@type": "rdf:Property",
"subPropertyOf": "rdf:object",
"domain": ["#Edge", "#Feature"],
"range": "#Node",
"comment": "Links to the node that this edge points to. In RDF, this would be called the 'object'."
},
{
"@id": "#symmetric",
"@type": "rdf:Property",
"domain": ["#Edge", "#Relation"],
"range": "xsd:boolean",
"comment": "A relation or edge can be 'symmetric'. When this boolean value is true, it indicates that it doesn't matter which node is the 'start' or 'end'."
},
{
"@id": "#weight",
"@type": "rdf:Property",
"domain": ["#Edge", "#RelatedNode"],
"range": "xsd:float",
"comment": "A numerical value indicating how strongly one should believe the statement this edge makes. Weights are set in an ad-hoc way by the modules that import data into ConceptNet. Weights also appear on RelatedNodes, indicating how related that node is to the query, on a scale from -1 to 1."
},
{
"@id": "#value",
"@type": "rdf:Property",
"domain": "#Query",
"range": "xsd:float",
"comment": "A value indicating the result of a numerical query, such as the /relatedness query for how related two nodes are."
},
{
"@id": "#node",
"@type": "rdf:Property",
"domain": ["#Edge", "#Feature"],
"range": "#Node",
"comment": "Sometimes we want to specify that a ConceptNet edge either starts or ends at a certain node, but it doesn't matter which. This is the case for symmetric relations, such as /r/Synonym. In those cases, we can refer to either node with the 'node' property. Not to be confused with 'cn:Node', which is a data type."
},
{
"@id": "#features",
"@type": "rdf:Property",
"domain": "#Query",
"range": "#Query",
"comment": "API responses can be grouped into 'features' (see the comment for the type #Feature) based on what they describe about the node being queried. In a grouped API response, the 'features' property is a list of all these groups. Each group is a smaller Query with a 'feature' property."
},
{
"@id": "#feature",
"@type": "rdf:Property",
"domain": "#Query",
"range": "#Feature",
"comment": "When this property is present, the query is selecting edges that match a particular feature (see the comment for #Feature)."
},
{
"@id": "#label",
"@type": "rdf:Property",
"subPropertyOf": "rdfs:label",
"domain": "#Node",
"range": "xsd:string",
"comment": "The natural-language label of a node. Every node with a 'label' will also have a 'language' containing the BCP 47 language code for the language it's in. The 'language' isn't a property we define ourselves, it's just an alias for the JSON-LD keyword '@language'."
},
{
"@id": "#sense_label",
"@type": "rdf:Property",
"domain": "#Node",
"range": "xsd:string",
"comment": "A URL-safe string that can distinguish multiple senses of a word. Often this is just a part-of-speech label, such as 'n' or 'v'."
},
{
"@id": "#term",
"@type": "rdf:Property",
"domain": "#Node",
"range": "#Node",
"comment": "The 'term' property links a node to its plain, possibly ambiguous form, without any sense label attached to it. If there wasn't a sense label, then the node's 'term' will link to itself."
},
{
"@id": "#site",
"@type": "rdf:Property",
"domain": "#Node",
"range": "xsd:string",
"comment": "ConceptNet has 'ExternalURL' edges that point to terms in other Linked Data resources. The '@id' of such a term contains its complete URL, where you may be able to find more data. The 'site' property contains just the domain name of the resource."
},
{
"@id": "#related",
"@type": "rdf:Property",
"domain": "#Query",
"range": "#RelatedNode",
"comment": "A list returned when you make a '/related' query, listing the nodes that are most related to the query according to the ConceptNet Numberbatch term vectors. Each node is expressed as a RelatedNode object, with an @id and a weight."
},
{
"@id": "#sources",
"@type": "rdf:Property",
"domain": "#Edge",
"range": "#Source",
"comment": "The 'sources' of an edge are a set of independent reasons we believe this assertion. Edges with more than one source are more reliable. Each of these individual sources is identified by an '@id', and can have a 'contributor', a 'process', and/or an 'activity' identifying more specifically where the data came from. If it only takes one of those to describe the source, then its @id will also be the @id of the source."
},
{
"@id": "#contributor",
"@type": "rdf:Property",
"domain": "#Source",
"range": "rdfs:Resource",
"comment": "A property of a source, indicating the person or resource that contributed an edge in ConceptNet."
},
{
"@id": "#process",
"@type": "rdf:Property",
"domain": "#Source",
"range": "rdfs:Resource",
"comment": "A property of a source, indicating a computational process that led to an edge in ConceptNet."
},
{
"@id": "#activity",
"@type": "rdf:Property",
"domain": "#Source",
"range": "rdfs:Resource",
"comment": "A property of a source, identifying a crowd-sourcing activity that led to an edge in ConceptNet."
},
{
"@id": "#dataset",
"@type": "rdf:Property",
"domain": "#Edge",
"range": "rdfs:Resource",
"comment": "A property of an edge, separating edges broadly into different 'datasets' that came from different sources or processes."
},
{
"@id": "#surfaceText",
"@type": "rdf:Property",
"domain": "#Edge",
"range": "xsd:string",
"comment": "The natural language text that corresponds to an edge. If both nodes attached to the edge are in the same language, the surfaceText will be in that language. We may add a property in the future that more helpfully distinguishes the language of these surface texts."
},
{
"@id": "#license",
"@type": "rdf:Property",
"domain": ["#Edge", "#Query"],
"range": "https://creativecommons.org/ns#License",
"comment": "A link to the Creative Commons license under which you can remix or redistribute this information."
},
{
"@id": "pages:PartialCollectionView",
"@type": "rdfs:Datatype",
"comment": "An object containing links to more pages of results. There's no single standard for this, but we vaguely follow the recommendation at https://www.w3.org/community/hydra/wiki/Pagination#PartialCollectionView."
},
{
"@id": "pages:view",
"@type": "rdf:Property",
"domain": "#Query",
"range": "pages:PartialCollectionView",
"comment": "Appears on a response that returns more edges than fit in the response. Contains links to more pages of results."
},
{
"@id": "pages:paginatedProperty",
"@type": "rdf:Property",
"domain": "pages:PartialCollectionView",
"range": "rdf:Property",
"comment": "Indicates which property -- such as 'edges' -- contains the list that's being paginated."
},
{
"@id": "pages:firstPage",
"@type": "rdf:Property",
"domain": "pages:PartialCollectionView",
"range": "#Query",
"comment": "A link to the first page of results."
},
{
"@id": "pages:nextPage",
"@type": "rdf:Property",
"domain": "pages:PartialCollectionView",
"range": "#Query",
"comment": "A link to the next page of results. Only present if there is a next page."
},
{
"@id": "pages:previousPage",
"@type": "rdf:Property",
"domain": "pages:PartialCollectionView",
"range": "#Query",
"comment": "A link to the previous page of results. Only present if there is a previous page."
}
],
"@context": {
"@base": "http://api.conceptnet.io/ld/conceptnet5.7/context.ld.json",
"cn": "http://api.conceptnet.io/ld/conceptnet5.7/context.ld.json#",
"pages": "http://api.conceptnet.io/ld/conceptnet5.7/context.ld.json#pagination-",
"cc": "http://creativecommons.org/licenses/",
"dc": "http://purl.org/dc/terms/",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"vann": "http://purl.org/vocab/vann/",
"xsd": "http://www.w3.org/2001/XMLSchema#",
"Node": "cn:Node",
"Edge": "cn:Edge",
"Relation": "cn:Relation",
"Source": "cn:Source",
"RelatedNode": "cn:RelatedNode",
"PartialCollectionView": "pages:PartialCollectionView",
"comment": {"@id": "rdfs:comment", "@type": "xsd:string"},
"definitions": {"@id": "vann:termGroup", "@type": "@id"},
"seeAlso": {"@id": "rdfs:seeAlso", "@type": "@id"},
"domain": {"@id": "rdfs:domain", "@type": "@id"},
"range": {"@id": "rdfs:range", "@type": "@id"},
"subClassOf": {"@id": "rdfs:subClassOf", "@type": "@id"},
"subPropertyOf": {"@id": "rdfs:subPropertyOf", "@type": "@id"},
"rel": {"@id": "cn:rel", "@type": "@id"},
"start": {"@id": "cn:start", "@type": "@id"},
"end": {"@id": "cn:end", "@type": "@id"},
"symmetric": {"@id": "cn:symmetric", "@type": "xsd:boolean"},
"weight": {"@id": "cn:weight", "@type": "xsd:float"},
"value": {"@id": "cn:value", "@type": "xsd:float"},
"node": {"@id": "cn:node", "@type": "@id"},
"feature": {"@id": "cn:feature", "@type": "@id"},
"features": {"@id": "cn:features", "@container": "@set", "@type": "@id"},
"label": {"@id": "cn:label", "@type": "xsd:string"},
"language": "@language",
"sense_label": {"@id": "cn:sense_label", "@type": "xsd:string"},
"term": {"@id": "cn:term", "@type": "@id"},
"site": {"@id": "cn:site", "@type": "xsd:string"},
"edges": {"@id": "cn:edges", "@container": "@set", "@type": "cn:Edge"},
"related": {"@id": "cn:related", "@container": "@list", "@type": "@id"},
"sources": {"@id": "cn:sources", "@container": "@set", "@type": "@id"},
"contributor": {"@id": "dc:contributor", "@type": "@id"},
"process": {"@id": "cn:process", "@type": "@id"},
"activity": {"@id": "cn:activity", "@type": "@id"},
"dataset": {"@id": "cn:dataset", "@type": "@id"},
"surfaceText": {"@id": "cn:surfaceText", "@type": "xsd:string"},
"license": {"@id": "cn:license", "@type": "@id"},
"view": {"@id": "pages:view", "@type": "pages:PartialCollectionView"},
"paginatedProperty": {"@id": "pages:paginatedProperty", "@type": "@vocab"},
"firstPage": {"@id": "pages:firstPage", "@type": "@id"},
"nextPage": {"@id": "pages:nextPage", "@type": "@id"},
"previousPage": {"@id": "pages:previousPage", "@type": "@id"}
},
"@id": "cn:",
"vann:preferredNamespacePrefix": "cn",
"dc:creator": "rspeer@luminoso.com",
"seeAlso": "http://api.conceptnet.io/docs"
}
......@@ -14,7 +14,7 @@
{% endif %}
</h1>
<h2 class="subtitle">
Results from <a href="/" class="version">ConceptNet 5.6</a>
Results from <a href="/" class="version">ConceptNet 5.7</a>
</h2>
<div class="sources">
{{ sources | describe_sources_brief }}
......
......@@ -7,7 +7,7 @@
{{ show_term(term, 'h1') }}
<h2 class="subtitle">
{{term.language|describe_term_language}} in
<a href="/" class="version">ConceptNet 5.6</a>
<a href="/" class="version">ConceptNet 5.7</a>
</h2>
<div class="sources">
{{ sources | describe_sources_brief }}
......
__version__ = '5.6.0'
__version__ = '5.7.0'
......@@ -5,7 +5,7 @@ from setuptools.command.develop import develop
import sys
packages = find_packages()
version_str = '5.6.4'
version_str = '5.7.0'
if sys.version_info.major < 3:
print("The ConceptNet 5 code can only run in Python 3.")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment