Skip to content

Commit

Permalink
Merge pull request #8 from UUDigitalHumanitieslab/subj-reworked
Browse files Browse the repository at this point in the history
Subj reworked
  • Loading branch information
JeltevanBoheemen authored Jan 26, 2023
2 parents 9580c94 + 44f1a54 commit 733b200
Show file tree
Hide file tree
Showing 17 changed files with 180 additions and 76 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# This workflow installs the Python dependencies, prepares the PyPI package and lints it with flake8 on several Python versions (the unit-test step below is currently commented out)
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Unit tests

# Run on manual dispatch and on every push that touches something other than Markdown files.
on:
  workflow_dispatch:
  push:
    paths-ignore:
      - '**.md'

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.7', '3.10']

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Prepare PIP package
        run: |
          cd pypi
          ./prepare.sh
      - name: Lint with flake8
        # Only lint the Python files listed in pypi/include.txt.
        # The pattern is single-quoted so the shell passes the backslash through
        # and grep matches a literal ".py" suffix instead of "any character + py".
        run: |
          flake8 $(grep '\.py$' pypi/include.txt) --count --max-complexity=12 --max-line-length=127 --statistics
      # - name: Run unit tests
      #   run: |
      #     pip install pytest
      #     python -m pytest
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
# Sastadev

[![Actions Status](https://github.com/UUDigitalHumanitieslab/sastadev/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitieslab/sastadev/actions)

[pypi sastadev](https://pypi.org/project/sastadev)

Method definitions for use in SASTA

Copy `default_config.py` to your own `config.py` in the `sastadev` directory, and change what you need.

## Upload to PyPI

Specify the files which should be included in the package in `pypi/include.txt`.

```bash
cd pypi
./prepare.sh
Expand Down
7 changes: 4 additions & 3 deletions alpinoparsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from memoize import memoize

import logging
from typing import Optional
#from sastatypes import SynTree, URL

#from config import SDLOGGER
Expand Down Expand Up @@ -90,6 +89,8 @@ def parse(origsent: str, escape: bool = True):
return None

#def previewurl(stree: SynTree) -> URL:


def previewurl(stree):
'''
The function *previewurl* returns the URL to preview the input SynTree *stree* in the GreTEL application.
Expand Down Expand Up @@ -122,9 +123,9 @@ def escape_alpino_input(instr: str) -> str:
result = ''
for c in instr:
if c == '[':
newc = '\['
newc = '\\['
elif c == ']':
newc = '\]'
newc = '\\]'
else:
newc = c
result += newc
Expand Down
31 changes: 15 additions & 16 deletions celexlexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
posre = re.compile(pospattern)



# dml columns
IdNum, Head, Inl, MorphStatus, MorphCnt, DerComp, Comp, Def, Imm, \
ImmSubCat, ImmAllo, ImmSubst, StrucLab, StrucAllo, StrucSubst, Sepa = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
Expand Down Expand Up @@ -151,21 +150,21 @@ def dcoiphi2celexpv(thesubj: SynTree, thepv: SynTree, inversion: bool) -> str:


celex2dcoimap: Dict[str, Dict[str, str]] =\
{'te1': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'te2': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'te2t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
'te3': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'te3t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
'te2I': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'tm': {'pvtijd': 'tgw', 'pvagr': 'mv', 'wvorm': 'pv'},
've': {'pvtijd': 'verl', 'pvagr': 'ev', 'wvorm': 'pv'},
'vm': {'pvtijd': 'verl', 'pvagr': 'mv', 'wvorm': 'pv'},
'i': {'wvorm': 'inf', 'positie': 'vrij', 'buiging': 'zonder'},
'pv': {'wvorm': 'vd', 'positie': 'vrij', 'buiging': 'zonder'},
'pt': {'wvorm': 'td', 'positie': 'vrij', 'buiging': 'zonder'},
'pvE': {'wvorm': 'vd', 'buiging': 'met-e', 'positie': 'prenom'},
'ptE': {'wvorm': 'td', 'buiging': 'met-e', 'positie': 'prenom'}
}
{'te1': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'te2': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'te2t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
'te3': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'te3t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
'te2I': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
'tm': {'pvtijd': 'tgw', 'pvagr': 'mv', 'wvorm': 'pv'},
've': {'pvtijd': 'verl', 'pvagr': 'ev', 'wvorm': 'pv'},
'vm': {'pvtijd': 'verl', 'pvagr': 'mv', 'wvorm': 'pv'},
'i': {'wvorm': 'inf', 'positie': 'vrij', 'buiging': 'zonder'},
'pv': {'wvorm': 'vd', 'positie': 'vrij', 'buiging': 'zonder'},
'pt': {'wvorm': 'td', 'positie': 'vrij', 'buiging': 'zonder'},
'pvE': {'wvorm': 'vd', 'buiging': 'met-e', 'positie': 'prenom'},
'ptE': {'wvorm': 'td', 'buiging': 'met-e', 'positie': 'prenom'}
}


def celexpv2dcoi(word: str, infl: str, lemma: str) -> Dict[str, str]:
Expand Down
Empty file added celexlexicon/__init__.py
Empty file.
Empty file added celexlexicon/dutch/__init__.py
Empty file.
8 changes: 4 additions & 4 deletions deregularise.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,11 +303,11 @@ def CV(thestr):


def dup(thestr):
return(thestr + thestr)
return thestr + thestr


def endsin(stem, thechar):
return(stem[-1] == thechar)
return stem[-1] == thechar


def startswithprefix(stem):
Expand Down Expand Up @@ -486,7 +486,7 @@ def getstems(el):
takesge = False
else:
takesge = True
return(stem, stemFS, takesge)
return stem, stemFS, takesge


def makepastpartwithe(stem, stemFS, takesge, prefix='ge'):
Expand Down Expand Up @@ -627,7 +627,7 @@ def makeparadigm(word, forms):
#: two strings (corrected form, metadata) as value. This dictionary is filled by
#: reading from the file with the name in the constant *correctionfilename* upon
#: initialisation of the module *deregularise*
correction: Dict[str, Tuple[str,str]] = {}
correction: Dict[str, Tuple[str, str]] = {}
correctionfile = open(os.path.join(SD_DIR, correctionfilename), 'r', encoding='utf8')
myreader = csv.reader(correctionfile, delimiter=tab)
for row in myreader:
Expand Down
7 changes: 2 additions & 5 deletions lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
import treebankfunctions
from namepartlexicon import namepart_isa_namepart, namepart_isa_namepart_uc

from typing import Any, Dict, List, Optional, Tuple
from sastatypes import CELEXPosCode, CELEX_INFL, DCOITuple, DeHet, Lemma, SynTree, WordInfo
from typing import Any, Dict, List, Optional
from sastatypes import CELEX_INFL, DCOITuple, Lemma, SynTree, WordInfo

space = ' '

Expand All @@ -39,7 +39,6 @@
dets[het] = ['het', 'dat', 'dit', 'ons', 'welk', 'ieder', 'elk', 'zulk']



def isa_namepart(word: str) -> bool:
'''
is the word a name part
Expand All @@ -58,8 +57,6 @@ def isa_namepart_uc(word: str) -> bool:
return namepart_isa_namepart_uc(word)




def lookup(dct: Dict[str, Any], key: str) -> str:
'''
looks up key in dct, if so it returns dct[key] else ''
Expand Down
2 changes: 1 addition & 1 deletion metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,4 @@ def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, penalty=defaultpenalt
insertion = 'Insertion'
smallclause = 'Small Clause Treatment'
tokenmapping = 'Token Mapping'
insertiontokenmapping = 'Insertion Token Mapping'
insertiontokenmapping = 'Insertion Token Mapping'
2 changes: 2 additions & 0 deletions pypi/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
include sastadev/LICENSE
include sastadev/*.txt
include sastadev/celexlexicon/dutch/*.txt
include sastadev/names/nameparts/namepartlexicon.csv
14 changes: 14 additions & 0 deletions pypi/__config__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
#!/usr/bin/env python3
# Default configuration shipped with the PyPI package.
# pypi/prepare.sh copies this file into the package directory as config.py.
import logging
import os.path as op
import sentence_parser

# logging object
# getLogger() without a name returns the root logger.
SDLOGGER = logging.getLogger()

# Absolute path of the directory that contains this file.
SD_DIR = op.dirname(op.abspath(__file__))

# Alpino
# Host name and port of the Alpino parser server.
# NOTE(review): presumably read by sentence_parser when connecting - confirm.
ALPINO_HOST = 'localhost'
ALPINO_PORT = 7001

# Function to parse a sentence with Alpino
# Should take a string as input and return an lxml.etree tree
# NOTE(review): presumably the parsed Alpino syntax tree of the sentence - confirm.
PARSE_FUNC = sentence_parser.parse
19 changes: 19 additions & 0 deletions pypi/include.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
__init__.py
alpinoparsing.py
celexlexicon
celexlexicon.py
deregularise.py
inflectioncorrection.tsv.txt
lexicon.py
LICENSE
memoize.py
metadata.py
namepartlexicon.py
names
py.typed
query.py
sastatoken.py
sastatypes.py
sentence_parser.py
stringfunctions.py
treebankfunctions.py
8 changes: 5 additions & 3 deletions pypi/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
find sastadev/ -type f -not -name '.gitignore' -delete
TARGET=$PWD/../pypi/sastadev/
cp __config__.py $TARGET/config.py
cd ..
cp LICENSE __init__.py deregularise.py inflectioncorrection.tsv.txt py.typed $TARGET

cd pypi
while read SOURCE
do
cp -r ../$SOURCE $TARGET
done < include.txt

python setup.py sdist
15 changes: 11 additions & 4 deletions pypi/setup.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
from setuptools import setup, find_packages
from setuptools import setup

with open('README.md') as file:
long_description = file.read()

setup(
name='sastadev',
python_requires='>=3.5, <4',
version='0.0.2',
python_requires='>=3.7, <4',
version='0.0.3',
description='Linguistic functions for SASTA tool',
long_description=long_description,
long_description_content_type="text/markdown",
author='Digital Humanities Lab, Utrecht University',
author_email='digitalhumanities@uu.nl',
url='https://github.com/UUDigitalHumanitieslab/sastadev',
license='BSD-3-Clause',
include_package_data=True,
packages=['sastadev'],
package_data={'sastadev': ['*.txt', 'LICENSE', 'py.typed']}
package_data={
'sastadev': ['*.txt', 'LICENSE', 'py.typed']
}
)
9 changes: 3 additions & 6 deletions sastatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
This module contains definitions of types used in multiple modules
'''

from typing import Dict, List, Any, Tuple, Callable, Pattern, Optional, NewType, Union
from lxml import etree # type: ignore
from typing import Dict, List, Tuple, Callable, Optional, Union
from lxml import etree # type: ignore
from collections import Counter
from query import Query
from sastatoken import Token
Expand All @@ -18,7 +18,7 @@
LocationName = str
DCOIPt = str
DeHet = str
CELEX_INFL =str
CELEX_INFL = str
DCOITuple = Tuple
Lemma = str
CorrectionMode = str # Literal['0','1','n']
Expand Down Expand Up @@ -72,6 +72,3 @@
#CoreQueryFunction = Callable[[SynTree], List[SynTree]]
#PostQueryFunction = Callable[[SynTree, allresults.AllResults], List[SynTree]]
#QueryFunction = Union[CoreQueryFunction, PostQueryFunction]



2 changes: 1 addition & 1 deletion stringfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
tab = '\t'
comma = ','

csvre = "'[^']+'|[^,' ]+" #for selecting nonempty tokens from a csvstring ; comma between single quotes is allowed
csvre = "'[^']+'|[^,' ]+" # for selecting nonempty tokens from a csvstring ; comma between single quotes is allowed
csvpat = re.compile(csvre)

wpat = r'^.*\w.*$'
Expand Down
Loading

0 comments on commit 733b200

Please sign in to comment.