uri.py 12.4 KB
Newer Older
1
2
3
4
5
"""
URIs are Unicode strings that represent the canonical name for any object in
ConceptNet. These can be used with the ConceptNet Web API, or referred to in a
Semantic Web application, by attaching the prefix:

    http://api.conceptnet.io

For example, the English concept "book" has the URI '/c/en/book'. This concept
can be referred to, or retrieved, using this complete URI:

    http://api.conceptnet.io/c/en/book
"""

Rob Speer's avatar
Rob Speer committed
14

15
def standardize_text(text, lowercase=True):
    """
    Unconditionally raise NotImplementedError.

    This function has been superseded by
    `conceptnet5.nodes.preprocess_and_tokenize_text`. Both arguments are
    accepted but never read.
    """
    message = (
        "This function has been superseded by "
        "conceptnet5.nodes.preprocess_and_tokenize_text."
    )
    raise NotImplementedError(message)
20
21


22
23
def join_uri(*pieces):
    """
    Build a URI out of `pieces`, joining them with slashes (/).

    Leading and trailing slashes on each piece are acceptable, but they are
    stripped before joining. The result always begins with a slash and has
    exactly one slash between consecutive pieces.

    The pieces do not have `preprocess_and_tokenize_text` applied to them; to
    make sure your URIs are in normal form, run `preprocess_and_tokenize_text`
    on each piece that represents arbitrary text.

    >>> join_uri('/c', 'en', 'cat')
    '/c/en/cat'

    >>> join_uri('c', 'en', ' spaces ')
    '/c/en/ spaces '

    >>> join_uri('/r/', 'AtLocation/')
    '/r/AtLocation'

    >>> join_uri('/test')
    '/test'

    >>> join_uri('test')
    '/test'

    >>> join_uri('/test', '/more/')
    '/test/more'
    """
    # Only slashes are stripped; other whitespace in a piece is preserved.
    trimmed = (piece.strip('/') for piece in pieces)
    return '/' + '/'.join(trimmed)


Rob Speer's avatar
Rob Speer committed
57
def concept_uri(lang, text, *more):
    """
    Build the URI of a concept: a word or phrase in a particular language,
    which can participate in relations with other concepts and may be linked
    to concepts in other languages.

    Every concept has an ISO language code and a text. It may also have a part
    of speech (pos), which is typically a single letter. If it does, it may
    have a disambiguation, a string that distinguishes it from other concepts
    with the same text.

    Call this function as follows, where arguments after `text` are optional:

        concept_uri(lang, text, pos, disambiguation...)

    `text` and `disambiguation` should be strings that have already been run
    through `preprocess_and_tokenize_text`. A `text` or disambiguation that
    contains a space fails an assertion. If the first extra argument is not
    exactly one character long, all of the extra arguments are discarded.

    This is a low-level interface. See `standardized_concept_uri` in nodes.py
    for a more generally applicable function that also deals with special
    per-language handling.

    >>> concept_uri('en', 'cat')
    '/c/en/cat'
    >>> concept_uri('en', 'cat', 'n')
    '/c/en/cat/n'
    >>> concept_uri('en', 'cat', 'n', 'feline')
    '/c/en/cat/n/feline'
    >>> concept_uri('en', 'this is wrong')
    Traceback (most recent call last):
        ...
    AssertionError: 'this is wrong' is not in normalized form
    """
    assert ' ' not in text, "%r is not in normalized form" % text

    extras = more
    if extras and len(extras[0]) != 1:
        # We misparsed a part of speech; everything after the text is
        # probably junk
        extras = ()

    # Only the components after the part of speech are disambiguations.
    for disambiguation in extras[1:]:
        assert ' ' not in disambiguation, (
            "%r is not in normalized form" % disambiguation
        )

    return join_uri('/c', lang, text, *extras)
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118


def compound_uri(op, args):
    """
    Build the URI of a compound structure from its operator and arguments.

    Some URIs represent a compound structure or operator built out of a number
    of arguments. Some examples are the '/and' and '/or' operators, which
    represent a conjunction or disjunction over two or more URIs, which may
    themselves be compound URIs; or the assertion structure, '/a', which takes
    a relation and two URIs as its arguments.

    `op` is the main 'operator', with the slash included, and `args` is an
    iterable of any number of argument URIs.

    These structures contain square brackets as segments, which look like
    `/[/` and `/]/`, so that compound URIs can contain other compound URIs
    without ambiguity. Arguments are separated by `/,/` segments.

    >>> compound_uri('/nothing', [])
    '/nothing/[/]'
    >>> compound_uri('/a', ['/r/CapableOf', '/c/en/cat', '/c/en/sleep'])
    '/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]'
    """
    segments = [op, '[']
    for position, argument in enumerate(args):
        # A comma goes between arguments, so not before the first one.
        if position:
            segments.append(',')
        segments.append(argument)
    segments.append(']')
    return join_uri(*segments)


137
138
139
def split_uri(uri):
    """
    Return the slash-delimited pieces of a URI as a list.

    A string that does not start with a slash is returned unsplit, as a
    one-element list. Otherwise all leading slashes are removed before
    splitting, and a URI made only of slashes gives an empty list.

    >>> split_uri('/c/en/cat/n/animal')
    ['c', 'en', 'cat', 'n', 'animal']
    >>> split_uri('/')
    []
    """
    if uri.startswith('/'):
        remainder = uri.lstrip('/')
        return remainder.split('/') if remainder else []
    return [uri]
152
153


Rob Speer's avatar
Rob Speer committed
154
def uri_prefix(uri, max_pieces=3):
    """
    Truncate a ConceptNet URI to at most its first `max_pieces` components,
    dropping components that might make it too detailed.

    With the default `max_pieces` of 3, this converts disambiguated concepts
    into their more general ambiguous forms.

    A URI that `is_absolute_url` recognizes is returned unchanged.

    >>> uri_prefix('/c/en/cat/n/animal')
    '/c/en/cat'
    >>> uri_prefix('/c/en/cat/n')
    '/c/en/cat'
    >>> uri_prefix('/c/en/cat')
    '/c/en/cat'
    >>> uri_prefix('/c/en')
    '/c/en'
    >>> uri_prefix('/c/en/cat', 2)
    '/c/en'
    >>> uri_prefix('http://en.wikipedia.org/wiki/Example')
    'http://en.wikipedia.org/wiki/Example'
    """
    if not is_absolute_url(uri):
        leading = split_uri(uri)[:max_pieces]
        return join_uri(*leading)
    return uri


Rob Speer's avatar
Rob Speer committed
183
184
185
186
187
188
def uri_prefixes(uri, min_pieces=2):
    """
    Generate the URIs that are prefixes of `uri`: those that begin with the
    same path components. Prefixes are yielded from shortest to longest, and
    by default a prefix must have at least 2 components.

    If the URI has sub-parts that are grouped by square brackets, then
    only complete sub-parts will be allowed in prefixes.

    A URI that `is_absolute_url` recognizes is yielded as its only prefix.

    >>> list(uri_prefixes('/c/en/cat/n/animal'))
    ['/c/en', '/c/en/cat', '/c/en/cat/n', '/c/en/cat/n/animal']
    >>> list(uri_prefixes('/test/[/group/one/]/[/group/two/]'))
    ['/test/[/group/one/]', '/test/[/group/one/]/[/group/two/]']
    >>> list(uri_prefixes('http://en.wikipedia.org/wiki/Example'))
    ['http://en.wikipedia.org/wiki/Example']
    """
    if is_absolute_url(uri):
        yield uri
        return

    seen = []
    # Number of '[' pieces seen so far minus the number of ']' pieces; zero
    # means the two counts are equal.
    balance = 0
    for piece in split_uri(uri):
        seen.append(piece)
        if piece == '[':
            balance += 1
        elif piece == ']':
            balance -= 1
        if len(seen) >= min_pieces and balance == 0:
            yield join_uri(*seen)
Rob Speer's avatar
Rob Speer committed
208
209


210
211
212
213
def parse_compound_uri(uri):
    """
    Given a compound URI, extract its operator and its list of arguments.

    Returns a tuple `(op, args)`, where `op` is the operator URI that
    precedes the first `/[/` segment, and `args` is the list of argument
    URIs found between that bracket and the final `/]`. Commas only separate
    arguments at the top level; commas inside nested brackets stay part of
    the argument that contains them.

    Raises ValueError if the URI does not end with `/]` (including a URI
    with no pieces at all, such as '/'), or if it contains no `/[/` segment.
    An assertion fails if the brackets inside the argument list are
    unbalanced.

    >>> parse_compound_uri('/nothing/[/]')
    ('/nothing', [])
    >>> parse_compound_uri('/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]')
    ('/a', ['/r/CapableOf', '/c/en/cat', '/c/en/sleep'])
    >>> parse_compound_uri('/or/[/and/[/s/one/,/s/two/]/,/and/[/s/three/,/s/four/]/]')
    ('/or', ['/and/[/s/one/,/s/two/]', '/and/[/s/three/,/s/four/]'])
    """
    pieces = split_uri(uri)
    # An empty piece list has no last element to inspect; report it as the
    # same malformed-URI error instead of letting IndexError escape.
    if not pieces or pieces[-1] != ']':
        raise ValueError("Compound URIs must end with /]")
    if '[' not in pieces:
        raise ValueError(
            "Compound URIs must contain /[/ at the beginning of the argument list"
        )
    list_start = pieces.index('[')
    op = join_uri(*pieces[:list_start])

    chunks = []
    current = []
    depth = 0

    # Split on commas, but not if they're within additional pairs of brackets.
    for piece in pieces[(list_start + 1) : -1]:
        if piece == ',' and depth == 0:
            chunks.append('/' + ('/'.join(current)).strip('/'))
            current = []
        else:
            current.append(piece)
            if piece == '[':
                depth += 1
            elif piece == ']':
                depth -= 1

    assert depth == 0, "Unmatched brackets in %r" % uri
    # The last argument has no trailing comma, so flush it here. An empty
    # argument list leaves `current` empty and adds nothing.
    if current:
        chunks.append('/' + ('/'.join(current)).strip('/'))
    return op, chunks


253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def parse_possible_compound_uri(op, uri):
    """
    List the things combined in `uri`, which may or may not be a compound
    URI with the operator `op`.

    The AND and OR conjunctions can be expressed as compound URIs, but if they
    contain only one thing, they are returned as just that single URI, not a
    compound.

    `op` is given without slashes. If `uri` starts with '/' + op + '/', this
    returns the argument list of the compound URI; otherwise it returns a
    list containing only `uri` itself.

    >>> parse_possible_compound_uri(
    ...    'or', '/or/[/and/[/s/one/,/s/two/]/,/and/[/s/three/,/s/four/]/]'
    ... )
    ['/and/[/s/one/,/s/two/]', '/and/[/s/three/,/s/four/]']
    >>> parse_possible_compound_uri('or', '/s/contributor/omcs/dev')
    ['/s/contributor/omcs/dev']
    """
    operator_prefix = '/' + op + '/'
    if not uri.startswith(operator_prefix):
        return [uri]
    _operator, arguments = parse_compound_uri(uri)
    return arguments


275
276
277
def conjunction_uri(*sources):
    """
    Make a URI representing a conjunction of sources that work together to
    provide an assertion. Duplicate sources are dropped and the remaining
    sources are sorted in lexicographic order.

    If only one distinct source remains, it is returned as is, not wrapped
    in an '/and' compound. This also applies when the same source is passed
    more than once.

    Raises ValueError when called with no sources.

    >>> conjunction_uri('/s/contributor/omcs/dev')
    '/s/contributor/omcs/dev'

    >>> conjunction_uri('/s/contributor/omcs/dev', '/s/contributor/omcs/dev')
    '/s/contributor/omcs/dev'

    >>> conjunction_uri('/s/rule/some_kind_of_parser', '/s/contributor/omcs/dev')
    '/and/[/s/contributor/omcs/dev/,/s/rule/some_kind_of_parser/]'
    """
    if not sources:
        # Logically, a conjunction with 0 inputs represents 'True', a
        # proposition that cannot be denied. This could be useful as a
        # justification for, say, mathematical axioms, but when it comes to
        # ConceptNet, that kind of thing makes us uncomfortable and shouldn't
        # appear in the data.
        raise ValueError("Conjunctions of 0 things are not allowed")

    # De-duplicate before counting, so that repeated sources cannot produce
    # a compound URI with a single argument.
    distinct = sorted(set(sources))
    if len(distinct) == 1:
        return distinct[0]
    return compound_uri('/and', distinct)
297
298


299
def assertion_uri(rel, start, end):
    """
    Make a URI for an assertion, as an '/a' compound URI whose arguments are
    its relation, start node, and end node, in that order.

    An assertion fails if `rel` does not start with '/r'.

    >>> assertion_uri('/r/CapableOf', '/c/en/cat', '/c/en/sleep')
    '/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]'
    """
    assert rel.startswith('/r'), rel
    arguments = (rel, start, end)
    return compound_uri('/a', arguments)
309

310

311
def is_concept(uri):
    """
    Return True if `uri` starts with the concept prefix '/c/'.

    >>> is_concept('/c/sv/klänning')
    True
    >>> is_concept('/x/en/ly')
    False
    >>> is_concept('/a/[/r/Synonym/,/c/ro/funcția_beta/,/c/en/beta_function/]')
    False
    """
    concept_prefix = '/c/'
    return uri.startswith(concept_prefix)


323
324
325
326
327
328
329
330
331
332
def is_relation(uri):
    """
    Return True if `uri` starts with the relation prefix '/r/'.

    >>> is_relation('/r/IsA')
    True
    >>> is_relation('/c/sv/klänning')
    False
    """
    relation_prefix = '/r/'
    return uri.startswith(relation_prefix)


333
def is_term(uri):
    """
    Return True if `uri` starts with one of the term prefixes, '/c/' or '/x/'.

    >>> is_term('/c/sv/kostym')
    True
    >>> is_term('/x/en/ify')
    True
    >>> is_term('/a/[/r/RelatedTo/,/c/en/cake/,/c/en/flavor/]')
    False
    """
    term_prefixes = ('/c/', '/x/')
    return uri.startswith(term_prefixes)


345
def is_absolute_url(uri):
    """
    Return True if `uri` starts with 'http' or with 'cc:'.

    We have URLs pointing to Creative Commons licenses, starting with 'cc:',
    which for Linked Data purposes are absolute URLs because they'll be resolved
    into full URLs.

    >>> is_absolute_url('http://fr.wiktionary.org/wiki/mįkká’e_uxpáðe')
    True
    >>> is_absolute_url('/c/fr/nouveau')
    False
    """
    absolute_prefixes = ('http', 'cc:')
    return uri.startswith(absolute_prefixes)
357
358


jlowryduda's avatar
jlowryduda committed
359
360
361
362
def get_uri_language(uri):
    """
    Extract the language from a concept URI. If the URI points to an assertion,
    get the language of its first concept.

    For a term URI, the language is its second slash-delimited piece. For a
    URI starting with '/a/', the second argument of the assertion (the one
    after the relation) is examined instead. Any other URI gives None.

    >>> get_uri_language('/a/[/r/RelatedTo/,/c/en/orchestra/,/c/en/symphony/]')
    'en'
    >>> get_uri_language('/c/pl/cześć')
    'pl'
    >>> get_uri_language('/x/en/able')
    'en'
    """
    if uri.startswith('/a/'):
        # Argument 0 is the relation; argument 1 is the assertion's first node.
        first_node = parse_possible_compound_uri('a', uri)[1]
        return get_uri_language(first_node)
    if is_term(uri):
        return split_uri(uri)[1]
    return None

jlowryduda's avatar
jlowryduda committed
378
379
380

def uri_to_label(uri):
    """
    Convert a ConceptNet URI into a label to be used in nodes.

    The label is the last piece of the URI with underscores replaced by
    spaces, so while '/c/en/example' will be converted into 'example',
    '/c/en/canary_islands' will be converted into 'canary islands'. Term URIs
    are first shortened with `uri_prefix`. A non-relation URI with fewer than
    three pieces gives an empty label.

    For a URI that `is_absolute_url` recognizes, the label is whatever follows
    its last slash, with underscores left as they are.

    >>> uri_to_label('/c/en/example')
    'example'
    >>> uri_to_label('/c/en/canary_islands')
    'canary islands'
    >>> uri_to_label('/c/en')
    ''
    >>> uri_to_label('/r/RelatedTo')
    'RelatedTo'
    >>> uri_to_label('http://wikidata.dbpedia.org/resource/Q89')
    'Q89'
    """
    if is_absolute_url(uri):
        return uri.rsplit('/', 1)[-1]

    target = uri_prefix(uri) if is_term(uri) else uri
    segments = split_uri(target)
    if len(segments) >= 3 or is_relation(target):
        return segments[-1].replace('_', ' ')
    return ''
jlowryduda's avatar
jlowryduda committed
405
406


Rob Speer's avatar
Rob Speer committed
407
class Licenses:
    """
    Namespace for the license identifiers used in this module, written as
    'cc:'-prefixed strings.
    """

    # 'cc:by/4.0'
    cc_attribution = 'cc:by/4.0'
    # 'cc:by-sa/4.0'
    cc_sharealike = 'cc:by-sa/4.0'