Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/mwestats integration #7

Merged
merged 5 commits into from
Jun 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
14 changes: 8 additions & 6 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,17 @@ jobs:
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 mwe_query --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 tests --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
# TODO: lower line length, lower complexity
flake8 mwe_query --count --exit-zero --max-complexity=22 --max-line-length=261 --statistics
# TODO:
# - name: Check typing
# run: |
# pip install mypy
# pip install lxml-stubs
# mypy **/*.py
# TODO:
# flake8 tests --count --exit-zero --max-complexity=22 --max-line-length=261 --statistics
- name: Check typing
run: |
pip install mypy
pip install lxml-stubs
mypy **/*.py
- name: Run unit tests
run: |
python -m unittest discover tests/
55 changes: 32 additions & 23 deletions mwe_query/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env python3
__author__ = 'marti'
import re
from alpino_query import parse_sentence
from alpino_query import parse_sentence # type: ignore
from copy import deepcopy
from typing import Dict, List, Optional
from typing import cast, Dict, List, Optional
import time
from .basex_query import list_databases, perform_xpath
import os
Expand Down Expand Up @@ -76,7 +76,7 @@ def __xml_to_xpath(self, root: ET.Element, number_of_child_nodes='loose', includ
# children.append('('+xml_to_xpath(elem, number_of_child_nodes=number_of_child_nodes, include_passives=include_passives) + ' or ' + xml_to_xpath(by_subject, number_of_child_nodes=number_of_child_nodes, include_passives=include_passives) + ')')
if elem.attrib.get('cat', None) == 'np' and [grandchild.attrib.get('pt', None) for grandchild in elem] in [['n'], ['ww']]:
grandchild = deepcopy([grandchild for grandchild in elem][0])
grandchild.attrib['rel'] = elem.attrib.get('rel', None)
grandchild.attrib['rel'] = elem.attrib.get('rel', '')
alternatives.append(grandchild)
# children.append('(' + xml_to_xpath(elem, number_of_child_nodes=number_of_child_nodes, include_passives=include_passives) + ' or ' + xml_to_xpath(grandchild, number_of_child_nodes=number_of_child_nodes, include_passives=include_passives) + ')')
if alternatives == [elem]:
Expand All @@ -98,18 +98,26 @@ def __remove_node_from_tree(self, root: ET.Element, id: str) -> None:
node = root.find(f'.//node[@id="{id}"]')
parent = root.find(f'.//node[@id="{id}"]...')
if parent is not None:
parent.remove(node)
parent.remove(cast(ET.Element, node))

def set_tree(self, alpino_xml: str) -> None:
self.parsed = ET.fromstring(alpino_xml)

def generate_queries(self) -> List['MweQuery']:
def generate_queries(self) -> List['MweQuery']: # noqa: C901
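"""Generate the queries for this multi-word expression.

Returns:
List[MweQuery]: the generated queries, each carrying a description, an XPath and a rank.
"""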
# expand index nodes in parse
mwe = expand_index_nodes(self.parsed)
generated: List[MweQuery] = []

if self.head == 'v':
mwe = mwe.find('.//node[@rel="vc"]')
vc = mwe.find('.//node[@rel="vc"]')
if vc is None:
raise ValueError('no @rel="vc" in expression')
mwe = vc
while True: # remove "trailing" top nodes
if len(mwe) == 1:
mwe = mwe[0]
Expand Down Expand Up @@ -161,9 +169,9 @@ def generate_queries(self) -> List['MweQuery']:
# as a sentential complement:
# hd/bw of pc/pp
# watch out for other R-pronouns (hiervan, daarvan)
xpath_1 = [self.__xml_to_xpath(child, number_of_child_nodes='strict')
for child in mwe]
xpath_1 = '//node[' + ' and '.join(xpath_1) + ']'
xpath_1_parts = [self.__xml_to_xpath(child, number_of_child_nodes='strict')
for child in mwe]
xpath_1 = '//node[' + ' and '.join(xpath_1_parts) + ']'
generated.append(
MweQuery(self, description='multi-word expression', xpath=xpath_1, rank=1))

Expand All @@ -185,11 +193,11 @@ def generate_queries(self) -> List['MweQuery']:
for feat in list(node.attrib.keys()):
if feat not in ['lemma', 'pt']:
node.attrib.pop(feat, None)
xpath_3 = [node for node in mwe.iter() if set(
xpath_3_elements = [node for node in mwe.iter() if set(
node.attrib.keys()) != set()]
xpath_3 = ['..//' + self.__xml_to_xpath(node) for node in xpath_3]
xpath_3_parts = ['..//' + self.__xml_to_xpath(node) for node in xpath_3_elements]
# this assumes a single top node
xpath_3 = '/node[' + ' and '.join(xpath_3) + ']'
xpath_3 = '/node[' + ' and '.join(xpath_3_parts) + ']'
generated.append(
MweQuery(self, description='superset', xpath=xpath_3, rank=3))

Expand Down Expand Up @@ -224,12 +232,13 @@ def run_query(self, database: str, output_folder: str, max_trees=None, from_file
for x in result:
i += 1
tree = ET.fromstring(x)
sentence = [
child.text for child in tree if child.tag == 'sentence'][0]
output_treebank.write(ET.tostring(tree).decode() + '\n')
output_plain.write(sentence + '\n')
if i == max_trees:
break
sentences = [
child.text or '' for child in tree if child.tag == 'sentence']
if sentences and sentences[0]:
output_treebank.write(ET.tostring(tree).decode() + '\n')
output_plain.write(sentences[0] + '\n')
if i == max_trees:
break
output_treebank.write('</treebank>')
output_treebank.close()
output_plain.close()
Expand All @@ -242,20 +251,20 @@ def handle_rel_rhd(node: ET.Element, sentence: ET.Element) -> Optional[ET.Elemen
id_ = node.attrib['id']
parent = sentence.find(f'.//node[@id="{id_}"]...')
if parent is None:
return
return None
elif parent.attrib.get('cat') != 'rel':
return
return None
if node.attrib.get('word') == 'zoals':
# TODO: handle 'zoals' as rhd (relative head)
print("WARNING: encountered 'zoals' as relative head. Ignoring for now, not fully implemented. Filling in dummy 'zo'.")
return ET.Element('node', attrib={'frame': 'adverb', 'id': id_, 'lcat': 'advp',
'pos': 'adv', 'root': 'zo', 'sense': 'zo',
'word': 'zo', 'lemma': 'zo', 'pt': 'bw', })
antecedent = sentence.find(f'.//node[@id="{id_}"]....')
if antecedent.attrib.get('cat') == 'conj':
if antecedent and antecedent.attrib.get('cat') == 'conj':
antecedent = sentence.find(f'.//node[@id="{id_}"]......')
if antecedent.attrib.get('cat') in ['top', 'du']:
return
if not antecedent or antecedent.attrib.get('cat') in ['top', 'du']:
return None

node_copy = deepcopy(node)
antecedent = deepcopy(antecedent)
Expand Down
2 changes: 1 addition & 1 deletion mwe_query/basex_query.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__author__ = 'marti'
from BaseXClient import BaseXClient
from BaseXClient import BaseXClient # type: ignore
import os

basex_location = 'C:/Program Files (x86)/BaseX/data'
Expand Down
Loading