-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.py
131 lines (110 loc) · 4.7 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import itertools
import json
import logging
from typing import Dict, Iterable, List

import numpy as np
import requests
from overrides import overrides

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
# Endpoint of the CAM (comparative argument mining) demo API at Uni Hamburg.
URL = 'http://ltdemos.informatik.uni-hamburg.de/cam-api'
# Outbound HTTP(S) proxy applied to every API request.
# NOTE(review): hard-coded proxy address — presumably environment-specific;
# confirm it is still valid before deploying elsewhere.
proxies = {
"http": "http://165.225.66.34:10015/",
"https": "https://165.225.66.34:10015/",
}
def get_response(first_object, second_object, fast_search=True,
                 aspects=None, weights=None, timeout=60):
    """Query the CAM comparison API for two objects and return the response.

    Parameters
    ----------
    first_object, second_object :
        The two objects to compare (sent as ``objectA`` / ``objectB``).
    fast_search : bool
        Whether the API should use fast-search mode (sent lowercased as ``fs``).
    aspects : list, optional
        Comparison aspects; if given, ``weights`` must have the same length.
    weights : list, optional
        One weight per aspect.
    timeout : float
        Seconds to wait for the HTTP response. New backward-compatible
        parameter; defaults to 60.

    Returns
    -------
    requests.Response
        The raw HTTP response from the CAM API.

    Raises
    ------
    ValueError
        If ``aspects`` and ``weights`` differ in length.
    """
    num_aspects = len(aspects) if aspects is not None else 0
    num_weights = len(weights) if weights is not None else 0
    if num_aspects != num_weights:
        raise ValueError(
            "Number of weights should be equal to the number of aspects")
    params = {
        'objectA': first_object,
        'objectB': second_object,
        'fs': str(fast_search).lower()
    }
    if num_aspects:
        # Aspect/weight query parameters are 1-indexed: aspect1, weight1, ...
        params.update({'aspect{}'.format(i + 1): aspect
                       for i, aspect in enumerate(aspects)})
        params.update({'weight{}'.format(i + 1): weight
                       for i, weight in enumerate(weights)})
    # BUG FIX: requests.get without a timeout can block indefinitely on a
    # stalled connection; pass the (configurable) timeout through.
    response = requests.get(url=URL, params=params, proxies=proxies,
                            timeout=timeout)
    return response
def _is_divider(line: str) -> bool:
empty_line = line.strip() == ""
if empty_line:
return True
else:
first_token = line.split()[0]
if first_token == "-DOCSTART-":
return True
else:
return False
class ConllUniversalReader(DatasetReader):
    """Read CoNLL-style column files into AllenNLP ``Instance``s.

    Each non-divider line holds a token followed by one or more tag columns;
    sentences are separated by blank lines or ``-DOCSTART-`` lines.  The
    column selected by ``tag_index`` (counted from the first column after the
    token) provides the NER tags.
    """

    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tag_index: int = 0,
        coding_scheme: str = "IOB1",
        label_namespace: str = "labels",
        **kwargs,
    ) -> None:
        """
        # Parameters

        token_indexers : maps index names to ``TokenIndexer``s; defaults to a
            single-id indexer under the ``"tokens"`` key.
        tag_index : zero-based index of the tag column among the non-token
            columns; a negative value disables tag reading.
        coding_scheme : ``"IOB1"`` keeps labels as read; ``"BIOUL"``
            re-encodes them via ``to_bioul``.
        label_namespace : vocabulary namespace for the tag labels.
        """
        super().__init__(**kwargs)
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if coding_scheme not in ("IOB1", "BIOUL"):
            raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))
        self.tag_index = tag_index
        self.coding_scheme = coding_scheme
        self.label_namespace = label_namespace
        # Input files are assumed to already be IOB1-encoded.
        self._original_coding_scheme = "IOB1"

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        # If `file_path` is a URL, redirect to the cache.
        file_path = cached_path(file_path)
        # BUG FIX: the module referenced a `logger` name that was never
        # defined anywhere in the file (NameError at runtime).
        logger = logging.getLogger(__name__)
        # Explicit encoding so reads do not depend on the platform default.
        with open(file_path, "r", encoding="utf-8") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            # Group into alternating divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to
                # the words of a single sentence.
                if not is_divider:
                    rows = [line.strip().split() for line in lines]
                    # Transpose rows into columns; zip yields tuples, but our
                    # Fields need lists.
                    columns = [list(column) for column in zip(*rows)]
                    tokens_ = columns[0]
                    if self.tag_index >= 0:
                        # `tag_index` counts within the non-token columns.
                        ner_tags = columns[1 + self.tag_index]
                    else:
                        ner_tags = None
                    # TextField requires `Token` objects.
                    tokens = [Token(token) for token in tokens_]
                    yield self.text_to_instance(tokens, ner_tags)

    def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        ner_tags: List[str] = None,
    ) -> Instance:
        """Build an ``Instance`` from pre-tokenized input.

        We take `pre-tokenized` input here, because we don't have a tokenizer
        in this class.  Adds a ``tokens`` TextField, a ``metadata`` field with
        the raw words, and (when tags are present) a ``tags`` label field.
        """
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {"tokens": sequence}
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        # Recode the labels if necessary.
        if self.coding_scheme == "BIOUL":
            coded_ner = (
                to_bioul(ner_tags, encoding=self._original_coding_scheme)
                if ner_tags is not None
                else None
            )
        else:
            # The default IOB1: keep tags as read from the file.
            coded_ner = ner_tags
        # Add "tag label" to instance only when tags exist and are non-empty.
        if coded_ner:
            instance_fields["tags"] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
        return Instance(instance_fields)