-
Notifications
You must be signed in to change notification settings - Fork 1
/
sequence_labelling.py
139 lines (114 loc) · 3.21 KB
/
sequence_labelling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""Convert the FGCR dataset json to the format that the ACE model accepts.
Note: ACE doesn't support classification, so we're only generating the cause and
effect labels here.
Example input:
```json
{
"tid": 2771,
"info": "If one or more of Ecolab's customers were to experience a disastrous outcome, the firm's reputation could suffer and it could lose multiple customers as a result.", # noqa
"extraInfo": null,
"labelData": [
{
"type": "cause",
"reason": [
[
3,
76
]
],
"result": [
[
78,
149
]
]
}
]
}
```
Example output:
```
If O
one B-Cause
or I-Cause
more I-Cause
of I-Cause
Ecolab I-Cause
's I-Cause
customers I-Cause
were I-Cause
to I-Cause
experience I-Cause
a I-Cause
disastrous I-Cause
outcome I-Cause
, O
the B-Effect
firm I-Effect
's I-Effect
reputation I-Effect
could I-Effect
suffer I-Effect
and I-Effect
it I-Effect
could I-Effect
lose I-Effect
multiple I-Effect
customers I-Effect
as O
a O
result O
. O
```
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from nltk.tokenize import NLTKWordTokenizer
def convert_instance(instance: dict[str, Any]) -> list[tuple[str, str]]:
"""Convert a FGCR-format instance into an ACE-format instance.
This ignores the relationship and only annotates the causes and effects using
B-Cause, I-Cause, B-Effect, I-Effect and O.
This happens at the token level, so we tokenise the text using the
NLTKWordTokenizer. The output is a list of (token, label) pairs.
"""
text = instance["info"]
spans = list(NLTKWordTokenizer().span_tokenize(text))
label_map = {"reason": "Cause", "result": "Effect"}
labels = {}
for label_data in instance["labelData"]:
for ev_type in ["reason", "result"]:
for ev_start, ev_end in label_data[ev_type]:
is_first = True
for t_start, t_end in spans:
if ev_start <= t_start and t_end <= ev_end:
tag = "B" if is_first else "I"
is_first = False
labels[(t_start, t_end)] = f"{tag}-{label_map[ev_type]}"
out = []
for start, end in spans:
token = text[start:end]
label = labels.get((start, end), "O")
out.append((token, label))
return out
def format_instance(instances: list[tuple[str, str]]) -> str:
return "\n".join(f"{token} {label}" for token, label in instances)
def convert_file(infile: Path, outfile: Path) -> None:
with infile.open() as f:
dataset = json.load(f)
instances = [convert_instance(instance) for instance in dataset]
converted = "\n\n".join(format_instance(i) for i in instances)
outfile.parent.mkdir(exist_ok=True, parents=True)
with outfile.open("w") as f:
print(converted, file=f)
def main() -> None:
raw_folder = Path("../data/raw")
new_folder = Path("./sequence_labelling")
splits = ["dev", "test", "train"]
for split in splits:
raw_path = raw_folder / f"event_dataset_{split}.json"
new_path = new_folder / f"{split}.txt"
convert_file(raw_path, new_path)
if __name__ == "__main__":
main()