Merge pull request #40 from allenai/convert-spike-doc
Spike doc instead of odin doc
aryehgigi authored May 1, 2022
2 parents 3d03816 + 987fc81 commit e17e1bb
Showing 4 changed files with 78 additions and 183 deletions.
37 changes: 23 additions & 14 deletions pybart/api.py
@@ -1,6 +1,6 @@
 import math
 
-from .conllu_wrapper import parse_conllu, serialize_conllu, parse_odin, conllu_to_odin, parsed_tacred_json
+from .conllu_wrapper import parse_conllu, serialize_conllu, parse_spike_sentence, conllu_to_spike, parsed_tacred_json
 from .converter import Convert, get_conversion_names as inner_get_conversion_names, init_conversions
 from spacy.language import Language
 from .spacy_wrapper import parse_spacy_sent, enhance_to_spacy_doc
@@ -13,21 +13,30 @@ def convert_bart_conllu(conllu_text, enhance_ud=True, enhanced_plus_plus=True, e
     return serialize_conllu(converted, all_comments, remove_eud_info, remove_extra_info, preserve_comments)
 
 
-def _convert_bart_odin_sent(doc, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version):
-    sents = parse_odin(doc)
+def _convert_spike_paragraph(paragraph, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version):
+    sents = [parse_spike_sentence(sent) for sent in paragraph['sentences']]
     con = Convert(sents, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version)
     converted_sents, _ = con()
-    return conllu_to_odin(converted_sents, doc, remove_eud_info, remove_extra_info)
-
-
-def convert_bart_odin(odin_json, enhance_ud=True, enhanced_plus_plus=True, enhanced_extra=True, conv_iterations=math.inf, remove_eud_info=False, remove_extra_info=False, remove_node_adding_conversions=False, remove_unc=False, query_mode=False, funcs_to_cancel=None, ud_version=1):
-    if "documents" in odin_json:
-        for doc_key, doc in odin_json["documents"].items():
-            odin_json["documents"][doc_key] = _convert_bart_odin_sent(doc, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version)
-    else:
-        odin_json = _convert_bart_odin_sent(odin_json, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version)
-
-    return odin_json
+    # ATTENTION - overrides original json
+    conllu_to_spike(converted_sents, paragraph, remove_eud_info, remove_extra_info)
+
+
+def _convert_spike_sections(section, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version):
+    for body in section["body"]:
+        if body["type"] == "section":
+            _convert_spike_sections(body, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version)
+        else:
+            _convert_spike_paragraph(body, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version)
+
+
+def convert_spike_annh(spike_json, enhance_ud=True, enhanced_plus_plus=True, enhanced_extra=True, conv_iterations=math.inf, remove_eud_info=False, remove_extra_info=False, remove_node_adding_conversions=False, remove_unc=False, query_mode=False, funcs_to_cancel=None, ud_version=1):
+    # ASSUMPTION - SPIKE's annh structure jsonl where each line is a doc with:
+    # doc.body -> Array<Section> -> Section.body: Array<Paragraph|Section> -> Either:
+    #     -> Section.body: Array<Paragraph|Section>
+    #     -> Paragraph.sentences: Array<Sentence>
+    for section in spike_json["body"]:
+        _convert_spike_sections(section, enhance_ud, enhanced_plus_plus, enhanced_extra, conv_iterations, remove_eud_info, remove_extra_info, remove_node_adding_conversions, remove_unc, query_mode, funcs_to_cancel, ud_version)
+    return spike_json
 
 
 def convert_bart_tacred(tacred_json, enhance_ud=True, enhanced_plus_plus=True, enhanced_extra=True, conv_iterations=math.inf, remove_eud_info=False, remove_extra_info=False, remove_node_adding_conversions=False, remove_unc=False, query_mode=False, funcs_to_cancel=None, ud_version=1):
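
For orientation, below is a minimal sketch of the annh-shaped input that the new convert_spike_annh entry point walks. It uses only the keys the added code reads (body, type, sentences, and the per-sentence words, lemmas, pos, graphs); the concrete sentence, the "paragraph" type string, and the tag values are illustrative assumptions rather than part of the commit.

from pybart.api import convert_spike_annh  # module shown in this diff

# "John sleeps" as a one-section, one-paragraph, one-sentence document (illustrative values)
doc = {
    "body": [{
        "type": "section",
        "body": [{
            "type": "paragraph",  # anything whose type is not "section" is treated as a paragraph
            "sentences": [{
                "words": ["John", "sleeps"],
                "lemmas": ["John", "sleep"],
                "pos": ["NNP", "VBZ"],
                "graphs": {"universal-basic": {
                    "edges": [{"parent": 1, "child": 0, "label": "nsubj"}],  # 0-based token indices
                    "roots": [1],
                }},
            }],
        }],
    }],
}

# Mutates the sentence dicts in place (see the ATTENTION note above) and returns the same object.
converted = convert_spike_annh(doc)
print(converted["body"][0]["body"][0]["sentences"][0]["graphs"].keys())

After the call, each sentence should additionally carry a graphs["universal-enhanced"] entry, produced by the conllu_wrapper.py changes below.
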
205 changes: 39 additions & 166 deletions pybart/conllu_wrapper.py
@@ -90,184 +90,57 @@ def serialize_conllu(converted, all_comments, remove_enhanced_extra_info, remove
     return "\n".join(["\n".join(sent) + "\n" for sent in text])
 
 
-# fw.conllu_to_odin(converter.convert(fw.parse_conllu(fw.odin_to_conllu(json_buf)[0])))
-# or better off: fw.conllu_to_odin(converter.convert(fw.parse_odin(json_buf))))
-def parse_odin(odin_json):
-    sentences = []
-    for sent in odin_json['sentences']:
-        sentence = list()
-        for i, (word, tag, lemma) in enumerate(zip(sent['words'], sent['tags'], sent['lemmas'])):
-            sentence.append(Token(TokenId(i + 1), word, lemma, "_", tag, "_", None, "_", "_", "_"))
-        for edge in sent['graphs']['universal-basic']['edges']:
-            sentence[edge['destination'] + 1].set_conllu_field('head', TokenId(edge['source'] + 1))
-            sentence[edge['destination'] + 1].set_conllu_field('deprel', edge['relation'])
-        for root in sent['graphs']['universal-basic']['roots']:
-            sentence[root + 1].set_conllu_field('head', TokenId(0))
-            sentence[root + 1].set_conllu_field('deprel', "root")
-        sentence.append(Token(TokenId(0), None, None, None, None, None, None, None, None, None))
-
-        add_basic_edges(sentence)
-        sentences.append(sentence)
-
-    return sentences
-
-
-def _fix_sentence_keep_order(conllu_sentence):
-    sorted_sent = sorted(conllu_sentence)
-    addon = 0
-    fixed = list()
-
-    for token in sorted_sent:
-        iid = token.get_conllu_field("id")
-        if token.get_conllu_field("id").minor != 0:
-            if "CopyOf" in token.get_conllu_field("misc"):
-                token.set_conllu_field("form", token.get_conllu_field("form") + "[COPY_NODE]")
-            addon += 1
-
-        new_id = iid.major + addon
-        token.set_conllu_field("id", TokenId(new_id))
-        fixed.append(token)
-
-    return fixed
-
-
-def _fix_sentence_push_to_end(conllu_sentence):
-    fixed = list()
-
-    for i, token in enumerate(conllu_sentence):
-        iid = token.get_conllu_field("id")
-        if iid.major == 0:
-            continue
-        if iid.get_conllue_field("id").major != 0:
-            token.set_conllu_field("id", TokenId(i + 1))
-
-        fixed.append(token)
-
-    return fixed
-
-
-def fix_sentence(conllu_sentence, push_new_to_end=True):
-    if push_new_to_end:
-        return _fix_sentence_push_to_end(conllu_sentence)
-    else:
-        return _fix_sentence_keep_order(conllu_sentence)
-
-
-def fix_graph(conllu_sentence, odin_sentence, is_basic, remove_enhanced_extra_info, remove_bart_extra_info):
-    if is_basic:
-        odin_sentence["graphs"] = {"universal-basic": {"edges": [], "roots": []}}
-    else:
-        if 'graphs' in odin_sentence:
-            odin_sentence["graphs"]["universal-enhanced"] = {"edges": [], "roots": []}
-        else:
-            odin_sentence["graphs"] = {"universal-enhanced": {"edges": [], "roots": []}}
-
-    for iid, token in enumerate(conllu_sentence):
-        if token.get_conllu_field("id").major == 0:
-            continue
-
-        if is_basic:
-            if token.get_conllu_field("deprel").lower().startswith("root"):
-                odin_sentence["graphs"]["universal-basic"]["roots"].append(iid)
-            else:
-                odin_sentence["graphs"]["universal-basic"]["edges"].append(
-                    {"source": token.get_conllu_field("head").major - 1, "destination": iid,
-                     "relation": token.get_conllu_field("deprel")})
-        else:
-            for head, rels in token.get_new_relations():
-                for rel in rels:
-                    if rel.to_str(remove_enhanced_extra_info, remove_bart_extra_info).lower().startswith("root"):
-                        odin_sentence["graphs"]["universal-enhanced"]["roots"].append(iid)
-                    else:
-                        odin_sentence["graphs"]["universal-enhanced"]["edges"].append(
-                            {"source": head.get_conllu_field("id").major - 1, "destination": iid, "relation": rel.to_str(remove_enhanced_extra_info, remove_bart_extra_info)})
-
-    return odin_sentence
-
-
-def append_odin(odin_sent, fixed_sentence, text):
-    cur_sent_text = text
-    cur_offset = 0
-
-    for node in fixed_sentence[len(odin_sent['words']):]:
-        if node.get_conllu_field('id').major == 0:
-            continue
-
-        if 'words' in odin_sent:
-            odin_sent['words'].append(node.get_conllu_field('form'))
-        if 'raw' in odin_sent:
-            odin_sent['raw'].append(node.get_conllu_field('form'))
-        if 'tags' in odin_sent:
-            odin_sent['tags'].append(node.get_conllu_field('xpos'))
-        if 'entities' in odin_sent:
-            odin_sent['entities'].append('O')
-        if ('startOffsets' in odin_sent) and ('endOffsets' in odin_sent):
-            odin_sent['startOffsets'].append(odin_sent['endOffsets'][-1] + 1)
-            odin_sent['endOffsets'].append(odin_sent['startOffsets'][-1] + len(node.get_conllu_field('form')))
-        if 'lemmas' in odin_sent:
-            odin_sent['lemmas'].append(node.get_conllu_field('lemma'))
-        if 'chunks' in odin_sent:
-            odin_sent['chunks'].append('O')
-
-        cur_sent_text += " " + node.get_conllu_field('form')
-        cur_offset += len(" " + node.get_conllu_field('form'))
-
-    return odin_sent, cur_sent_text, cur_offset
-
-
-def fix_offsets(odin_sent, all_offset):
-    if ('startOffsets' in odin_sent) and ('endOffsets' in odin_sent):
-        odin_sent['startOffsets'] = [(current + all_offset) for current in odin_sent['startOffsets']]
-        odin_sent['endOffsets'] = [(current + all_offset) for current in odin_sent['endOffsets']]
-
-
-def conllu_to_odin(conllu_sentences, odin_to_enhance=None, is_basic=False, push_new_to_end=True, remove_enhanced_extra_info=False, remove_bart_extra_info=True):
-    odin_sentences = []
-    fixed_sentences = []
-    texts = []
-    summed_offset = 0
-
-    for i, conllu_sentence in enumerate(conllu_sentences):
-        fixed_sentence = conllu_sentence
-
-        if odin_to_enhance:
-            text = odin_to_enhance['text'][odin_to_enhance['sentences'][i]['startOffsets'][0]: odin_to_enhance['sentences'][i]['endOffsets'][-1]]
-
-            # fixing offsets may be to all sentences, as previous sentences may have become longer, changing all following offsets
-            fix_offsets(odin_to_enhance['sentences'][i], summed_offset)
-
-        # when added nodes appear fix sent
-        if any([tok.get_conllu_field("id").minor != 0 for tok in conllu_sentence]):
-            fixed_sentence = fix_sentence(fixed_sentence, push_new_to_end)
-            if odin_to_enhance:
-                odin_to_enhance['sentences'][i], text, cur_offset = append_odin(odin_to_enhance['sentences'][i], fixed_sentence, text)
-                summed_offset += cur_offset
-
-        # store updated text for each sentence
-        if odin_to_enhance:
-            texts.append(text)
-
-        # fix graph
-        fixed_sentences.append(fixed_sentence)
-        odin_sentences.append(fix_graph(
-            fixed_sentence, odin_to_enhance['sentences'][i] if odin_to_enhance else
-            {'words': [token.get_conllu_field("form") for token in fixed_sentence if token.get_conllu_field("id").major != 0],
-             'tags': [token.get_conllu_field("xpos") for token in fixed_sentence if token.get_conllu_field("id").major != 0]},
-            is_basic, remove_enhanced_extra_info, remove_bart_extra_info))
-
-    if odin_to_enhance:
-        odin_to_enhance['sentences'] = odin_sentences
-        odin_to_enhance['text'] = "\n".join(texts)
-        odin = odin_to_enhance
-    else:
-        odin = {"documents": {"": {
-            "id": str(uuid.uuid4()),
-            "text": " ".join([token.get_conllu_field("form") for conllu_sentence in fixed_sentences for token in
-                              (sorted(conllu_sentence) if not push_new_to_end else conllu_sentence) if token.get_conllu_field("id").major != 0]),
-            "sentences": odin_sentences
-        }}, "mentions": []}
-
-    return odin
+def parse_spike_sentence(spike_sentence):
+    sent = spike_sentence
+    output = list()
+    for i, (word, pos, lemma) in enumerate(zip(sent['words'], sent['pos'], sent['lemmas'])):
+        output.append(Token(TokenId(i + 1), word, lemma, "_", pos, "_", None, "_", "_", "_"))
+    for edge in sent['graphs']['universal-basic']['edges']:
+        output[edge['child']].set_conllu_field('head', TokenId(edge['parent'] + 1))
+        output[edge['child']].set_conllu_field('deprel', edge['label'])
+    for root in sent['graphs']['universal-basic']['roots']:
+        output[root].set_conllu_field('head', TokenId(0))
+        output[root].set_conllu_field('deprel', "root")
+    output.append(Token(TokenId(0), None, None, None, None, None, None, None, None, None))
+
+    add_basic_edges(output)
+
+    return output
+
+
+def fix_graph(conllu_sentence, spike_sentence, remove_enhanced_extra_info, remove_bart_extra_info):
+    if 'graphs' in spike_sentence:
+        spike_sentence["graphs"]["universal-enhanced"] = {"edges": [], "roots": []}
+    else:
+        spike_sentence["graphs"] = {"universal-enhanced": {"edges": [], "roots": []}}
+
+    for iid, token in enumerate(conllu_sentence):
+        if token.get_conllu_field("id").major == 0:
+            continue
+
+        for head, rels in token.get_new_relations():
+            for rel in rels:
+                if rel.to_str(remove_enhanced_extra_info, remove_bart_extra_info).lower().startswith("root"):
+                    spike_sentence["graphs"]["universal-enhanced"]["roots"].append(iid)
+                else:
+                    spike_sentence["graphs"]["universal-enhanced"]["edges"].append(
+                        {"parent": head.get_conllu_field("id").major - 1, "child": iid, "label": rel.to_str(remove_enhanced_extra_info, remove_bart_extra_info)})
+
+    return spike_sentence
+
+
+def conllu_to_spike(conllu_sentences, spike_to_enhance, remove_enhanced_extra_info, remove_bart_extra_info):
+    # ASSUMPTION - SPIKE doesnt allow node-adding conversions, so we dont need to fix text/offsets/etc
+    spike_sentences = []
+
+    for i, (conllu_sentence, spike_sentence) in enumerate(zip(conllu_sentences, spike_to_enhance['sentences'])):
+        spike_sentences.append(
+            fix_graph(conllu_sentence, spike_sentence, remove_enhanced_extra_info, remove_bart_extra_info)
+        )
+
+    spike_to_enhance['sentences'] = spike_sentences
+
+    return spike_to_enhance
 
 
 def parsed_tacred_json(data):
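
A small sketch of the index convention the two new helpers agree on, as we read the code above (the edge values are illustrative): SPIKE edges use 0-based token positions with parent/child/label keys, while the internal Token objects use 1-based TokenIds, with id 0 reserved for the artificial root.

# SPIKE-side basic edge (0-based positions), as consumed by parse_spike_sentence:
spike_edge = {"parent": 1, "child": 0, "label": "nsubj"}

# parse_spike_sentence: output[child] gets head TokenId(parent + 1), i.e. heads become 1-based.
head_token_id = spike_edge["parent"] + 1  # -> 2

# fix_graph writes "universal-enhanced" back with 0-based positions again:
# child is the token's position in the sentence list, parent is head.major - 1.
enhanced_edge = {"parent": head_token_id - 1, "child": spike_edge["child"], "label": "nsubj"}

# Round-trips as long as no nodes were added, which is exactly the ASSUMPTION noted in conllu_to_spike.
assert enhanced_edge == spike_edge
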
17 changes: 15 additions & 2 deletions pybart/converter.py
@@ -1204,11 +1204,15 @@ def eudpp_demote_quantificational_modifiers_det(sentence, matches, converter):
     tokens=[
         Token(id="relativizer", spec=[Field(FieldNames.WORD, relativizing_words + list(relativizers_to_rel.keys()))]),
         Token(id="gov"),
-        Token(id="mod")
+        Token(id="mod"),
+        # a special case where we have an xcomp father for the head of the clause we attach the subj of the xcomp
+        # e.g. "John is a man who like swimming", so "John" will become subj of like.
+        Token(id="grand", optional=True)
     ],
     edges=[
         Edge(child="mod", parent="gov", label=[HasLabelFromList(["acl:relcl"])]),  # TODO: english: relcl
-        Edge(child="relativizer", parent="mod", label=[HasLabelFromList(["/.*/"])])
+        Edge(child="relativizer", parent="mod", label=[HasLabelFromList(["/.*/"])]),
+        Edge(child="gov", parent="grand", label=[HasLabelFromList(["xcomp"])])
     ],
 )
 
@@ -1217,13 +1221,22 @@ def eudpp_add_ref_and_collapse(sentence, matches, converter):
     gov = sentence[cur_match.token('gov')]
     mod = sentence[cur_match.token('mod')]
     relativizer = sentence[cur_match.token('relativizer')]
+    grand = cur_match.token('grand')
 
     # this is a pretty basic case so we can assume only one label
     label = list(cur_match.edge(cur_match.token('relativizer'), cur_match.token('mod')))[0]
     text = relativizer.get_conllu_field("form").lower()
     # some relativizers that were simply missing on the eUD, we added them as nmods
     new_label = Label(udv(relativizers_to_rel[text]), eud=text if 1 == ud_version else "") if text in relativizers_to_rel else Label(label)
 
+    # for the special case we've added
+    if grand != -1:
+        grand = sentence[grand]
+        for child, rels in grand.get_children_with_rels():
+            for rel in rels:
+                if new_label.base == rel.base:
+                    child.add_edge(new_label, mod)
+
     reattach_children(relativizer, gov)
     relativizer.replace_edge(Label(label), Label("ref"), mod, gov)
     gov.add_edge(new_label, mod)
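
To make the new optional grand token concrete, here is a toy, self-contained sketch of the edge rewrite we believe the added branch performs (our reading of the code above; the relation names, the ref re-attachment, and the propagation to mod are illustrative, not text from the commit): when gov has an xcomp parent grand, any dependent that grand governs with the same base label as the relativizer is also attached under the relative-clause head mod.

# Toy model of eudpp_add_ref_and_collapse with the new "grand" branch (assumed reading).
# Pattern: grand --xcomp--> gov --acl:relcl--> mod, relativizer attached under mod with label R.
basic_edges = {
    ("gov", "grand"): "xcomp",
    ("mod", "gov"): "acl:relcl",
    ("relativizer", "mod"): "nsubj",  # e.g. "who" is the subject inside the relative clause
    ("subj", "grand"): "nsubj",       # the matrix subject attached to grand
}

new_label = "nsubj"  # derived from the relativizer, as in the function above

# Existing behaviour (roughly): gov inherits the relativizer's role under mod,
# and the relativizer is re-attached to the antecedent as "ref".
enhanced_edges = {("gov", "mod"): new_label, ("relativizer", "gov"): "ref"}

# Added behaviour: children of grand sharing the label's base are propagated to mod as well.
for (child, parent), label in basic_edges.items():
    if parent == "grand" and label == new_label:
        enhanced_edges[(child, "mod")] = new_label

print(enhanced_edges)
# {('gov', 'mod'): 'nsubj', ('relativizer', 'gov'): 'ref', ('subj', 'mod'): 'nsubj'}
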
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="pybart-nlp",
-    version="3.2.5",
+    version="3.2.6",
     author="Aryeh Tiktinsky",
     author_email="aryehgigi@gmail.com",
     description="python converter from UD-tree to BART-graph representations",
