combine_assertions.py 4.46 KB
Newer Older
1
from __future__ import unicode_literals, print_function
jlowryduda's avatar
jlowryduda committed
2

Rob Speer's avatar
Rob Speer committed
3
import itertools
4
import json
jlowryduda's avatar
jlowryduda committed
5

6
import os
jlowryduda's avatar
jlowryduda committed
7
8
9
10
11
12
13

from conceptnet5.edges import make_edge
from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
from conceptnet5.languages import ALL_LANGUAGES
from conceptnet5.readers.wiktionary import valid_language
from conceptnet5.uri import conjunction_uri,get_uri_language, is_absolute_url, Licenses, \
    split_uri, uri_prefix
14
from conceptnet5.util import get_support_data_filename
15
16
17
18

# NOTE(review): neither of these module-level values is referenced anywhere
# in this file; presumably they are imported by other modules — confirm
# before removing.
N = 100
# Working directory captured at import time, not at call time.
CURRENT_DIR = os.getcwd()

19

20
21
22
23
24
def get_blacklist():
    """
    Return the set of blacklisted strings read from 'blacklist.txt', one
    entry per line.

    Lines are stripped of surrounding whitespace: the previous
    `set(open(filename).readlines())` kept the trailing newline on every
    entry, so the `value in blacklist` membership test performed in
    `combine_assertions` could never match an in-memory string. Blank lines
    are skipped, and the file handle is closed via a context manager instead
    of being leaked.
    """
    filename = get_support_data_filename('blacklist.txt')
    with open(filename, encoding='utf-8') as blacklist_file:
        return {line.strip() for line in blacklist_file if line.strip()}


25
26
def weight_scale(weight):
    """
    Scale an edge weight, as a float.

    This scale starts out linear, then switches to a square-root scale at
    x=2: for weight <= 2 the result is the weight itself, and above 2 it is
    2 * sqrt(weight - 1). The two pieces meet smoothly at x=2.

    >>> weight_scale(-1)
    -1.0
    >>> weight_scale(0)
    0.0
    >>> weight_scale(1)
    1.0
    >>> weight_scale(2)
    2.0
    >>> weight_scale(5)
    4.0
    >>> weight_scale(10)
    6.0
    """
    if weight <= 2:
        # Linear region: the identity, promoted to float.
        return float(weight)
    # Square-root region, shifted so that it joins the line at (2, 2).
    return 2.0 * (weight - 1) ** 0.5
43
44


Rob Speer's avatar
Rob Speer committed
45
def keep_concept(uri):
    """
    Decide whether a node URI belongs in the combined output.

    Absolute URLs (external links) are always kept. Concept URIs are kept
    only when their language is one we support and their term piece is
    non-empty.
    """
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    language = get_uri_language(uri)
    if language not in ALL_LANGUAGES or not valid_language(language):
        return False
    # A concept URI splits as ['c', language, term, ...]; require the term
    # to be present and non-empty.
    return bool(split_uri(uri)[2])


Rob Speer's avatar
Rob Speer committed
58
59
60
61
62
63
def make_assertion(line_group):
    """
    Merge a group of tab-separated edge lines (sharing an assertion URI)
    into a single edge dictionary, or return None if the group is empty or
    involves concepts we don't keep.

    The merged edge sums the weights of the input edges, deduplicates their
    sources, takes the dataset of the first edge and the first available
    surface text, and uses the strongest license present.
    """
    # Drop trailing whitespace and discard lines that end up empty.
    rows = list(filter(None, (line.rstrip() for line in line_group)))
    if not rows:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    _uri, rel, start, end, _ = rows[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)

    if not (keep_concept(start) and keep_concept(end)):
        return None

    # The fifth column of each row is a JSON object of edge metadata.
    infos = [json.loads(row.split('\t')[4]) for row in rows]
    unscaled_weight = sum(info['weight'] for info in infos)
    licenses = {info['license'] for info in infos}
    dataset = infos[0]['dataset']

    # Use the first surface text that any of the merged edges provides.
    surface_text = None
    for info in infos:
        if 'surfaceText' in info:
            surface_text = info['surfaceText']
            break

    # Deduplicate sources: two sources are the same if the conjunction of
    # their (sorted) values is the same URI.
    sources = []
    seen_sources = set()
    for info in infos:
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                seen_sources.add(conjunction)
                sources.append(subsource)

    weight = weight_scale(unscaled_weight)
    # The share-alike license is the "stronger" one; it wins if any merged
    # edge carries it.
    license = (
        Licenses.cc_sharealike
        if Licenses.cc_sharealike in licenses
        else Licenses.cc_attribution
    )

    return make_edge(
        rel=rel, start=start, end=end, weight=weight,
        dataset=dataset, license=license, sources=sources,
        surfaceText=surface_text
    )


105
def combine_assertions(input_filename, output_filename):
    """
    Take in a tab-separated, sorted "CSV" file, indicated by
    `input_filename`, whose lines should be grouped together into
    assertions. Output a msgpack stream of assertions to the file indicated
    by `output_filename`.

    The input file should be made from multiple sources of assertions by
    concatenating and sorting them.

    The combined assertions will all have the dataset of the first edge that
    produces them, and the license of the strongest license being combined.

    This process requires its input to be a sorted CSV so that all edges for
    the same assertion will appear consecutively.
    """

    def group_func(line):
        "Group lines by their URI (their first column)."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_filename)
    out_bad = MsgpackStreamWriter(output_filename + '.reject')

    blacklist = get_blacklist()

    # Close both writers even if an exception interrupts processing;
    # previously they were only closed on the success path.
    try:
        with open(input_filename, encoding='utf-8') as stream:
            for key, line_group in itertools.groupby(stream, group_func):
                assertion = make_assertion(line_group)
                if assertion is None:
                    continue

                destination = out
                # Assertions whose combined weight is non-positive go to the
                # reject stream instead of the main output.
                if assertion['weight'] <= 0:
                    destination = out_bad
                # Likewise reject any assertion containing a blacklisted
                # string value; one match is enough, so stop scanning early.
                for value in assertion.values():
                    if isinstance(value, str) and value in blacklist:
                        destination = out_bad
                        break
                destination.write(assertion)
    finally:
        out.close()
        out_bad.close()