Skip to content

Commit

Permalink
Merge pull request #2752 from flairNLP/GH-2722-span-dataset
Browse files Browse the repository at this point in the history
GH-2722: Make Span detection more robust
  • Loading branch information
alanakbik authored May 7, 2022
2 parents 27e6c41 + 83f7106 commit a1732bc
Show file tree
Hide file tree
Showing 8 changed files with 273 additions and 16 deletions.
50 changes: 34 additions & 16 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,16 +572,30 @@ def _identify_span_columns(self, column_name_map, skip_first_line):
if skip_first_line:
file.readline()

sentence_1 = self._convert_lines_to_sentence(
self._read_next_sentence(file), word_level_tag_columns=column_name_map
)
# sentence_2 = self._convert_lines_to_sentence(self._read_next_sentence(file),
# word_level_tag_columns=column_name_map)
# check the first 5 sentences
probe = []
for i in range(5):
sentence = self._convert_lines_to_sentence(
self._read_next_sentence(file), word_level_tag_columns=column_name_map
)
if sentence:
probe.append(sentence)
else:
break

for sentence in [sentence_1]:
# go through all annotations
# go through all annotations and identify word- and span-level annotations
# - if a column has at least one BIES we know it's a Span label
# - if a column has at least one tag that is not BIOES, we know it's a Token label
# - problem cases are columns for which we see only O - in this case we default to Span
for sentence in probe:
for column in column_name_map:
if column == self.text_column or column == self.head_id_column:

# skip assigned columns
if (
column in self.word_level_tag_columns
or column in self.span_level_tag_columns
or column == self.head_id_column
):
continue

layer = column_name_map[column]
Expand All @@ -596,16 +610,20 @@ def _identify_span_columns(self, column_name_map, skip_first_line):
continue

for token in sentence:
if token.get_label(layer, "O").value != "O" and token.get_label(layer).value[0:2] not in [
"B-",
"I-",
"E-",
"S-",
]:
# if at least one token has a BIES, we know it's a span label
if token.get_label(layer).value[0:2] in ["B-", "I-", "E-", "S-"]:
self.span_level_tag_columns[column] = layer
break

# if at least one token has a label other than BIOES, we know it's a token label
elif token.get_label(layer, "O").value != "O":
self.word_level_tag_columns[column] = layer
break
if column not in self.word_level_tag_columns:
self.span_level_tag_columns[column] = layer

# all remaining columns that are not word-level are span-level
for column in column_name_map:
if column not in self.word_level_tag_columns:
self.span_level_tag_columns[column] = column_name_map[column]

for column in self.span_level_tag_columns:
log.debug(f"Column {column} ({self.span_level_tag_columns[column]}) is a span-level column.")
Expand Down
6 changes: 6 additions & 0 deletions tests/resources/tasks/span_labels/span_first.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Vgl. O
Rundschreiben O
RAB PARTA
1/2010 YEAR
Rz MISC
8. MISC
8 changes: 8 additions & 0 deletions tests/resources/tasks/span_labels/span_second.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-DOCSTART-

Vgl. O
Rundschreiben O
RAB PARTA
1/2010 YEAR
Rz MISC
8. MISC
10 changes: 10 additions & 0 deletions tests/resources/tasks/span_labels/span_third.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-DOCSTART-

Rundschreiben O

Vgl. O
Rundschreiben O
RAB PARTA
1/2010 YEAR
Rz MISC
8. MISC
32 changes: 32 additions & 0 deletions tests/resources/tasks/up_english/en_ewt-up-dev.conllu
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# newdoc id = weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713
# sent_id = weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713-0001
# text = From the AP comes this story :
1 From from ADP IN _ 3 case 3:case _ _ _
2 the the DET DT Definite=Def|PronType=Art 3 det 3:det _ _ _
3 AP AP PROPN NNP Number=Sing 4 obl 4:obl:from _ _ ARG2
4 comes come VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ come.03 V
5 this this DET DT Number=Sing|PronType=Dem 6 det 6:det _ _ _
6 story story NOUN NN Number=Sing 4 nsubj 4:nsubj _ _ ARG1
7 : : PUNCT : _ 4 punct 4:punct _ _ _

# sent_id = weblog-blogspot.com_nominations_20041117172713_ENG_20041117_172713-0002
# text = President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.
1 President President PROPN NNP Number=Sing 5 nsubj 5:nsubj _ _ ARG0 _
2 Bush Bush PROPN NNP Number=Sing 1 flat 1:flat _ _ _ _
3 on on ADP IN _ 4 case 4:case _ _ _ _
4 Tuesday Tuesday PROPN NNP Number=Sing 5 obl 5:obl:on _ _ ARGM-TMP _
5 nominated nominate VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _ nominate.01 V _
6 two two NUM CD NumType=Card 7 nummod 7:nummod _ _ _ _
7 individuals individual NOUN NNS Number=Plur 5 obj 5:obj _ _ ARG1 ARG0
8 to to PART TO _ 9 mark 9:mark _ _ _ _
9 replace replace VERB VB VerbForm=Inf 5 advcl 5:advcl:to _ replace.01 ARG2 V
10 retiring retire VERB VBG VerbForm=Ger 11 amod 11:amod _ _ _ _
11 jurists jurist NOUN NNS Number=Plur 9 obj 9:obj _ _ _ ARG1
12 on on ADP IN _ 14 case 14:case _ _ _ _
13 federal federal ADJ JJ Degree=Pos 14 amod 14:amod _ _ _ _
14 courts court NOUN NNS Number=Plur 11 nmod 11:nmod:on _ _ _ _
15 in in ADP IN _ 18 case 18:case _ _ _ _
16 the the DET DT Definite=Def|PronType=Art 18 det 18:det _ _ _ _
17 Washington Washington PROPN NNP Number=Sing 18 compound 18:compound _ _ _ _
18 area area NOUN NN Number=Sing 14 nmod 14:nmod:in SpaceAfter=No _ _ _
19 . . PUNCT . _ 5 punct 5:punct _ _ _ _
36 changes: 36 additions & 0 deletions tests/resources/tasks/up_english/en_ewt-up-test.conllu
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# newdoc id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200
# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0001
# text = What if Google Morphed Into GoogleOS?
1 What what PRON WP PronType=Int 0 root 0:root _ _ _
2 if if SCONJ IN _ 4 mark 4:mark _ _ _
3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ _ ARG1
4 Morphed morph VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl:if _ morph.01 V
5 Into into ADP IN _ 6 case 6:case _ _ _
6 GoogleOS GoogleOS PROPN NNP Number=Sing 4 obl 4:obl:into SpaceAfter=No _ ARG2
7 ? ? PUNCT . _ 4 punct 4:punct _ _ _

# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0002
# text = What if Google expanded on its search-engine (and now e-mail) wares into a full-fledged operating system?
1 What what PRON WP PronType=Int 0 root 0:root _ _ _
2 if if SCONJ IN _ 4 mark 4:mark _ _ _
3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ _ ARG0
4 expanded expand VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl:if _ expand.01 V
5 on on ADP IN _ 15 case 15:case _ _ _
6 its its PRON PRP$ Gender=Neut|Number=Sing|Person=3|Poss=Yes|PronType=Prs 15 nmod:poss 15:nmod:poss _ _ _
7 search search NOUN NN Number=Sing 9 compound 9:compound SpaceAfter=No _ _
8 - - PUNCT HYPH _ 9 punct 9:punct SpaceAfter=No _ _
9 engine engine NOUN NN Number=Sing 15 compound 15:compound _ _ _
10 ( ( PUNCT -LRB- _ 9 punct 9:punct SpaceAfter=No _ _
11 and and CCONJ CC _ 13 cc 13:cc _ _ _
12 now now ADV RB _ 13 advmod 13:advmod _ _ _
13 e-mail e-mail NOUN NN Number=Sing 9 conj 9:conj:and|15:compound SpaceAfter=No _ _
14 ) ) PUNCT -RRB- _ 15 punct 15:punct _ _ _
15 wares wares NOUN NNS Number=Plur 4 obl 4:obl:on _ _ ARG1
16 into into ADP IN _ 22 case 22:case _ _ _
17 a a DET DT Definite=Ind|PronType=Art 22 det 22:det _ _ _
18 full full ADV RB _ 20 advmod 20:advmod SpaceAfter=No _ _
19 - - PUNCT HYPH _ 20 punct 20:punct SpaceAfter=No _ _
20 fledged fledged ADJ JJ Degree=Pos 22 amod 22:amod _ _ _
21 operating operating NOUN NN Number=Sing 22 compound 22:compound _ _ _
22 system system NOUN NN Number=Sing 4 obl 4:obl:into SpaceAfter=No _ ARG4
23 ? ? PUNCT . _ 4 punct 4:punct _ _ _
92 changes: 92 additions & 0 deletions tests/resources/tasks/up_english/en_ewt-up-train.conllu
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001
# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.
1 Al Al PROPN NNP Number=Sing 0 root 0:root SpaceAfter=No _ _
2 - - PUNCT HYPH _ 1 punct 1:punct SpaceAfter=No _ _
3 Zaman Zaman PROPN NNP Number=Sing 1 flat 1:flat _ _ _
4 : : PUNCT : _ 1 punct 1:punct _ _ _
5 American american ADJ JJ Degree=Pos 6 amod 6:amod _ _ _
6 forces force NOUN NNS Number=Plur 7 nsubj 7:nsubj _ _ ARG0
7 killed kill VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 parataxis 1:parataxis _ kill.01 V
8 Shaikh Shaikh PROPN NNP Number=Sing 7 obj 7:obj _ _ ARG1
9 Abdullah Abdullah PROPN NNP Number=Sing 8 flat 8:flat _ _ _
10 al al PROPN NNP Number=Sing 8 flat 8:flat SpaceAfter=No _ _
11 - - PUNCT HYPH _ 8 punct 8:punct SpaceAfter=No _ _
12 Ani Ani PROPN NNP Number=Sing 8 flat 8:flat SpaceAfter=No _ _
13 , , PUNCT , _ 8 punct 8:punct _ _ _
14 the the DET DT Definite=Def|PronType=Art 15 det 15:det _ _ _
15 preacher preacher NOUN NN Number=Sing 8 appos 8:appos _ _ _
16 at at ADP IN _ 18 case 18:case _ _ _
17 the the DET DT Definite=Def|PronType=Art 18 det 18:det _ _ _
18 mosque mosque NOUN NN Number=Sing 7 obl 7:obl:at _ _ ARGM-LOC
19 in in ADP IN _ 21 case 21:case _ _ _
20 the the DET DT Definite=Def|PronType=Art 21 det 21:det _ _ _
21 town town NOUN NN Number=Sing 18 nmod 18:nmod:in _ _ _
22 of of ADP IN _ 23 case 23:case _ _ _
23 Qaim Qaim PROPN NNP Number=Sing 21 nmod 21:nmod:of SpaceAfter=No _ _
24 , , PUNCT , _ 21 punct 21:punct _ _ _
25 near near ADP IN _ 28 case 28:case _ _ _
26 the the DET DT Definite=Def|PronType=Art 28 det 28:det _ _ _
27 Syrian syrian ADJ JJ Degree=Pos 28 amod 28:amod _ _ _
28 border border NOUN NN Number=Sing 21 nmod 21:nmod:near SpaceAfter=No _ _
29 . . PUNCT . _ 1 punct 1:punct _ _ _

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002
# text = [This killing of a respected cleric will be causing us trouble for years to come.]
1 [ [ PUNCT -LRB- _ 10 punct 10:punct SpaceAfter=No _ _ _ _ _
2 This this DET DT Number=Sing|PronType=Dem 3 det 3:det _ _ _ _ _ _
3 killing killing NOUN NN Number=Sing 10 nsubj 10:nsubj _ kill.01 V _ ARG0 _
4 of of ADP IN _ 7 case 7:case _ _ _ _ _ _
5 a a DET DT Definite=Ind|PronType=Art 7 det 7:det _ _ _ _ _ _
6 respected respected ADJ JJ Degree=Pos 7 amod 7:amod _ _ _ _ _ _
7 cleric cleric NOUN NN Number=Sing 3 nmod 3:nmod:of _ _ ARG1 _ _ _
8 will will AUX MD VerbForm=Fin 10 aux 10:aux _ _ _ _ ARGM-MOD _
9 be be AUX VB VerbForm=Inf 10 aux 10:aux _ be.03 _ V _ _
10 causing cause VERB VBG VerbForm=Ger 0 root 0:root _ cause.01 _ _ V _
11 us we PRON PRP Case=Acc|Number=Plur|Person=1|PronType=Prs 10 iobj 10:iobj _ _ _ _ ARGM-GOL _
12 trouble trouble NOUN NN Number=Sing 10 obj 10:obj _ _ _ _ ARG1 _
13 for for ADP IN _ 14 case 14:case _ _ _ _ _ _
14 years year NOUN NNS Number=Plur 10 obl 10:obl:for _ _ _ _ ARGM-TMP ARG1
15 to to PART TO _ 16 mark 16:mark _ _ _ _ _ _
16 come come VERB VB VerbForm=Inf 14 acl 14:acl:to SpaceAfter=No come.01 _ _ _ V
17 . . PUNCT . _ 10 punct 10:punct SpaceAfter=No _ _ _ _ _
18 ] ] PUNCT -RRB- _ 10 punct 10:punct _ _ _ _ _ _

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003
# text = DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.
1 DPA DPA PROPN NNP Number=Sing 0 root 0:root SpaceAfter=No _ _ _ _ _
2 : : PUNCT : _ 1 punct 1:punct _ _ _ _ _ _
3 Iraqi iraqi ADJ JJ Degree=Pos 4 amod 4:amod _ _ _ _ _ _
4 authorities authority NOUN NNS Number=Plur 5 nsubj 5:nsubj _ _ ARG0 _ _ _
5 announced announce VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 parataxis 1:parataxis _ announce.01 V _ _ _
6 that that SCONJ IN _ 9 mark 9:mark _ _ _ _ _ _
7 they they PRON PRP Case=Nom|Number=Plur|Person=3|PronType=Prs 9 nsubj 9:nsubj _ _ _ _ ARG0 _
8 had have AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 9 aux 9:aux _ have.01 _ V _ _
9 busted bust VERB VBN Tense=Past|VerbForm=Part 5 ccomp 5:ccomp _ bust_up.04 ARG1 _ V _
10 up up ADP RP _ 9 compound:prt 9:compound:prt _ _ _ _ _ _
11 3 3 NUM CD NumType=Card 13 nummod 13:nummod _ _ _ _ _ _
12 terrorist terrorist ADJ JJ Degree=Pos 13 amod 13:amod _ _ _ _ _ _
13 cells cell NOUN NNS Number=Plur 9 obj 9:obj _ _ _ _ ARG1 ARG0
14 operating operate VERB VBG VerbForm=Ger 13 acl 13:acl _ operate.01 _ _ _ V
15 in in ADP IN _ 16 case 16:case _ _ _ _ _ _
16 Baghdad Baghdad PROPN NNP Number=Sing 14 obl 14:obl:in SpaceAfter=No _ _ _ _ ARGM-LOC
17 . . PUNCT . _ 1 punct 1:punct _ _ _ _ _ _

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004
# text = Two of them were being run by 2 officials of the Ministry of the Interior!
1 Two two NUM CD NumType=Card 6 nsubj:pass 6:nsubj:pass _ _ _ _ ARG1
2 of of ADP IN _ 3 case 3:case _ _ _ _ _
3 them they PRON PRP Case=Acc|Number=Plur|Person=3|PronType=Prs 1 nmod 1:nmod:of _ _ _ _ _
4 were be AUX VBD Mood=Ind|Tense=Past|VerbForm=Fin 6 aux 6:aux _ be.03 V _ _
5 being be AUX VBG VerbForm=Ger 6 aux:pass 6:aux:pass _ be.03 _ V _
6 run run VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _ run.01 _ _ V
7 by by ADP IN _ 9 case 9:case _ _ _ _ _
8 2 2 NUM CD NumType=Card 9 nummod 9:nummod _ _ _ _ _
9 officials official NOUN NNS Number=Plur 6 obl 6:obl:by _ _ _ _ ARG0
10 of of ADP IN _ 12 case 12:case _ _ _ _ _
11 the the DET DT Definite=Def|PronType=Art 12 det 12:det _ _ _ _ _
12 Ministry Ministry PROPN NNP Number=Sing 9 nmod 9:nmod:of _ _ _ _ _
13 of of ADP IN _ 15 case 15:case _ _ _ _ _
14 the the DET DT Definite=Def|PronType=Art 15 det 15:det _ _ _ _ _
15 Interior Interior PROPN NNP Number=Sing 12 nmod 12:nmod:of SpaceAfter=No _ _ _ _
16 ! ! PUNCT . _ 6 punct 6:punct _ _ _ _ _
55 changes: 55 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,38 @@ def test_load_column_corpus_options(tasks_base_path):
assert corpus.train[0].to_tokenized_string() == "This is New Berlin"


def test_load_span_data(tasks_base_path):
    """Load the three span-label fixtures and check the last entry of each.

    The fixture files are expected to yield one, two and three entries
    respectively; in every file the third token of the last entry is "RAB"
    carrying the "ner" label "PARTA".
    """
    fixtures = (
        ("span_first.txt", 1),
        ("span_second.txt", 2),
        ("span_third.txt", 3),
    )

    for file_name, expected_entries in fixtures:
        # load column dataset with the expected number of entries
        dataset = flair.datasets.ColumnDataset(
            tasks_base_path / "span_labels" / file_name,
            column_name_map={0: "text", 1: "ner"},
        )

        assert len(dataset) == expected_entries

        # the labelled token always sits in the last entry of the file
        last_index = expected_entries - 1
        assert dataset[last_index][2].text == "RAB"
        assert dataset[last_index][2].get_label("ner").value == "PARTA"


def test_load_germeval_data(tasks_base_path):
# get training, test and dev data
corpus = flair.datasets.ColumnCorpus(tasks_base_path / "ner_german_germeval", column_format={0: "text", 2: "ner"})
Expand All @@ -125,6 +157,29 @@ def test_load_ud_english_data(tasks_base_path):
assert len(corpus.test) == 4
assert len(corpus.dev) == 2

# check if Token labels are correct
sentence = corpus.train[0]
assert sentence[0].text == "From"
assert sentence[0].get_label("upos").value == "ADP"
assert sentence[1].text == "the"
assert sentence[1].get_label("upos").value == "DET"


def test_load_up_english_data(tasks_base_path):
    """Load the UP_ENGLISH fixture corpus and verify split sizes and frame labels."""
    # get training, test and dev data
    corpus = flair.datasets.UP_ENGLISH(tasks_base_path)

    expected_sizes = ((corpus.train, 4), (corpus.test, 2), (corpus.dev, 2))
    for split, size in expected_sizes:
        assert len(split) == size

    # check if Token labels for frames are correct
    first_dev_sentence = corpus.dev[0]

    # a token without a frame annotation falls back to the given zero tag value
    token_ap = first_dev_sentence[2]
    assert token_ap.text == "AP"
    assert token_ap.get_label("frame", zero_tag_value="no_label").value == "no_label"

    # a token with a frame annotation exposes it as a token-level label
    token_comes = first_dev_sentence[3]
    assert token_comes.text == "comes"
    assert token_comes.get_label("frame").value == "come.03"


def test_load_no_dev_data(tasks_base_path):
# get training, test and dev data
Expand Down

0 comments on commit a1732bc

Please sign in to comment.