Merge pull request #8 from UUDigitalHumanitieslab/subj-reworked

Subj reworked
UUDigitalHumanitieslab · Jan 26, 2023 · 733b200 · 733b200
2 parents 9580c94 + 44f1a54
commit 733b200
Show file tree

Hide file tree

Showing 17 changed files with 180 additions and 76 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,41 @@
+# This workflow installs the Python dependencies, prepares the PyPI package and lints it with flake8 on several Python versions (the unit-test step below is currently commented out)
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Unit tests
+
+on:
+  workflow_dispatch:
+  push:
+    paths-ignore:
+      - '**.md'  # pushes that only touch Markdown files do not trigger the workflow
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.7', '3.10']
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Prepare PIP package
+      run: |
+        cd pypi
+        ./prepare.sh
+    - name: Lint with flake8
+      run: |
+        flake8 $(grep '\.py$' pypi/include.txt) --count --max-complexity=12 --max-line-length=127 --statistics
+    # - name: Run unit tests
+    #   run: |
+    #     pip install pytest
+    #     python -m pytest
diff --git a/README.md b/README.md
@@ -1,11 +1,17 @@
 # Sastadev
 
+[![Actions Status](https://github.com/UUDigitalHumanitieslab/sastadev/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitieslab/sastadev/actions)
+
+[pypi sastadev](https://pypi.org/project/sastadev)
+
 Method definitions for use in SASTA
 
 Copy `default_config.py` to your own `config.py` in the `sastadev` directory, and change what you need.
 
 ## Upload to PyPi
 
+List the files and directories that should be included in the package in `pypi/include.txt`, one path per line.
+
 ```bash
 cd pypi
 ./prepare.sh

diff --git a/alpinoparsing.py b/alpinoparsing.py
@@ -19,7 +19,6 @@
 from memoize import memoize
 
 import logging
-from typing import Optional
 #from sastatypes import SynTree, URL
 
 #from config import SDLOGGER
@@ -90,6 +89,8 @@ def parse(origsent: str, escape: bool = True):
             return None
 
 #def previewurl(stree: SynTree) -> URL:
+
+
 def previewurl(stree):
     '''
     The function *previewurl* returns the URL to preview the input SynTree *stree* in the GreTEL application.
@@ -122,9 +123,9 @@ def escape_alpino_input(instr: str) -> str:
     result = ''
     for c in instr:
         if c == '[':
-            newc = '\['
+            newc = '\\['
         elif c == ']':
-            newc = '\]'
+            newc = '\\]'
         else:
             newc = c
         result += newc

diff --git a/celexlexicon.py b/celexlexicon.py
@@ -41,7 +41,6 @@
 posre = re.compile(pospattern)
 
 
-
 # dml columns
 IdNum, Head, Inl, MorphStatus, MorphCnt, DerComp, Comp, Def, Imm, \
     ImmSubCat, ImmAllo, ImmSubst, StrucLab, StrucAllo, StrucSubst, Sepa = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -151,21 +150,21 @@ def dcoiphi2celexpv(thesubj: SynTree, thepv: SynTree, inversion: bool) -> str:
 
 
 celex2dcoimap: Dict[str, Dict[str, str]] =\
-                {'te1': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
-                 'te2': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
-                 'te2t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
-                 'te3': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
-                 'te3t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
-                 'te2I': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
-                 'tm': {'pvtijd': 'tgw', 'pvagr': 'mv', 'wvorm': 'pv'},
-                 've': {'pvtijd': 'verl', 'pvagr': 'ev', 'wvorm': 'pv'},
-                 'vm': {'pvtijd': 'verl', 'pvagr': 'mv', 'wvorm': 'pv'},
-                 'i': {'wvorm': 'inf', 'positie': 'vrij', 'buiging': 'zonder'},
-                 'pv': {'wvorm': 'vd', 'positie': 'vrij', 'buiging': 'zonder'},
-                 'pt': {'wvorm': 'td', 'positie': 'vrij', 'buiging': 'zonder'},
-                 'pvE': {'wvorm': 'vd', 'buiging': 'met-e', 'positie': 'prenom'},
-                 'ptE': {'wvorm': 'td', 'buiging': 'met-e', 'positie': 'prenom'}
-                 }
+    {'te1': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
+     'te2': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
+     'te2t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
+     'te3': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
+     'te3t': {'pvtijd': 'tgw', 'pvagr': 'met-t', 'wvorm': 'pv'},
+     'te2I': {'pvtijd': 'tgw', 'pvagr': 'ev', 'wvorm': 'pv'},
+     'tm': {'pvtijd': 'tgw', 'pvagr': 'mv', 'wvorm': 'pv'},
+     've': {'pvtijd': 'verl', 'pvagr': 'ev', 'wvorm': 'pv'},
+     'vm': {'pvtijd': 'verl', 'pvagr': 'mv', 'wvorm': 'pv'},
+     'i': {'wvorm': 'inf', 'positie': 'vrij', 'buiging': 'zonder'},
+     'pv': {'wvorm': 'vd', 'positie': 'vrij', 'buiging': 'zonder'},
+     'pt': {'wvorm': 'td', 'positie': 'vrij', 'buiging': 'zonder'},
+     'pvE': {'wvorm': 'vd', 'buiging': 'met-e', 'positie': 'prenom'},
+     'ptE': {'wvorm': 'td', 'buiging': 'met-e', 'positie': 'prenom'}
+     }
 
 
 def celexpv2dcoi(word: str, infl: str, lemma: str) -> Dict[str, str]:

diff --git a/celexlexicon/__init__.py b/celexlexicon/__init__.py
diff --git a/celexlexicon/dutch/__init__.py b/celexlexicon/dutch/__init__.py
diff --git a/deregularise.py b/deregularise.py
@@ -303,11 +303,11 @@ def CV(thestr):
 
 
 def dup(thestr):
-    return(thestr + thestr)
+    return thestr + thestr
 
 
 def endsin(stem, thechar):
-    return(stem[-1] == thechar)
+    return stem[-1] == thechar
 
 
 def startswithprefix(stem):
@@ -486,7 +486,7 @@ def getstems(el):
         takesge = False
     else:
         takesge = True
-    return(stem, stemFS, takesge)
+    return stem, stemFS, takesge
 
 
 def makepastpartwithe(stem, stemFS, takesge, prefix='ge'):
@@ -627,7 +627,7 @@ def makeparadigm(word, forms):
 #: two strings (corrected form, metadata) as value. This dictionary is filled by
 #: reading from the file with the name in the constant *correctionfilename* upon
 #: initialisation of the module *deregularise*
-correction: Dict[str, Tuple[str,str]] = {}
+correction: Dict[str, Tuple[str, str]] = {}
 correctionfile = open(os.path.join(SD_DIR, correctionfilename), 'r', encoding='utf8')
 myreader = csv.reader(correctionfile, delimiter=tab)
 for row in myreader:

diff --git a/lexicon.py b/lexicon.py
@@ -13,8 +13,8 @@
 import treebankfunctions
 from namepartlexicon import namepart_isa_namepart, namepart_isa_namepart_uc
 
-from typing import Any, Dict, List, Optional, Tuple
-from sastatypes import CELEXPosCode, CELEX_INFL, DCOITuple, DeHet, Lemma, SynTree, WordInfo
+from typing import Any, Dict, List, Optional
+from sastatypes import CELEX_INFL, DCOITuple, Lemma, SynTree, WordInfo
 
 space = ' '
 
@@ -39,7 +39,6 @@
 dets[het] = ['het', 'dat', 'dit', 'ons', 'welk', 'ieder', 'elk', 'zulk']
 
 
-
 def isa_namepart(word: str) -> bool:
     '''
     is the word a name part
@@ -58,8 +57,6 @@ def isa_namepart_uc(word: str) -> bool:
     return namepart_isa_namepart_uc(word)
 
 
-
-
 def lookup(dct: Dict[str, Any], key: str) -> str:
     '''
     looks up key in dct, if so it returns dct[key] else ''

diff --git a/metadata.py b/metadata.py
@@ -104,4 +104,4 @@ def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, penalty=defaultpenalt
 insertion = 'Insertion'
 smallclause = 'Small Clause Treatment'
 tokenmapping = 'Token Mapping'
-insertiontokenmapping = 'Insertion Token Mapping'
+insertiontokenmapping = 'Insertion Token Mapping'
diff --git a/pypi/MANIFEST.in b/pypi/MANIFEST.in
@@ -1,2 +1,4 @@
 include sastadev/LICENSE
 include sastadev/*.txt
+include sastadev/celexlexicon/dutch/*.txt
+include sastadev/names/nameparts/namepartlexicon.csv
diff --git a/pypi/__config__.py b/pypi/__config__.py
@@ -1,3 +1,17 @@
 #!/usr/bin/env python3
+import logging
 import os.path as op
+import sentence_parser
+
+# logging object; logging.getLogger() called without a name returns the root logger
+SDLOGGER = logging.getLogger()
+
 SD_DIR = op.dirname(op.abspath(__file__))
+
+# Alpino host and port (presumably where the Alpino parser server listens; confirm against the code that connects)
+ALPINO_HOST = 'localhost'
+ALPINO_PORT = 7001
+
+# Function to parse a sentence with Alpino
+# Should take a string as input and return an lxml.etree
+PARSE_FUNC = sentence_parser.parse
diff --git a/pypi/include.txt b/pypi/include.txt
@@ -0,0 +1,19 @@
+__init__.py
+alpinoparsing.py
+celexlexicon
+celexlexicon.py
+deregularise.py
+inflectioncorrection.tsv.txt
+lexicon.py
+LICENSE
+memoize.py
+metadata.py
+namepartlexicon.py
+names
+py.typed
+query.py
+sastatoken.py
+sastatypes.py
+sentence_parser.py
+stringfunctions.py
+treebankfunctions.py
diff --git a/pypi/prepare.sh b/pypi/prepare.sh
@@ -2,8 +2,10 @@
 find sastadev/ -type f -not -name '.gitignore' -delete
 TARGET=$PWD/../pypi/sastadev/
 cp __config__.py $TARGET/config.py
-cd ..
-cp LICENSE __init__.py deregularise.py inflectioncorrection.tsv.txt py.typed $TARGET
 
-cd pypi
+while read SOURCE
+do
+    cp -r ../$SOURCE $TARGET
+done < include.txt
+
 python setup.py sdist
diff --git a/pypi/setup.py b/pypi/setup.py
@@ -1,15 +1,22 @@
-from setuptools import setup, find_packages
+from setuptools import setup
+
+with open('README.md') as file:
+    long_description = file.read()
 
 setup(
     name='sastadev',
-    python_requires='>=3.5, <4',
-    version='0.0.2',
+    python_requires='>=3.7, <4',
+    version='0.0.3',
     description='Linguistic functions for SASTA tool',
+    long_description=long_description,
+    long_description_content_type="text/markdown",
     author='Digital Humanities Lab, Utrecht University',
     author_email='digitalhumanities@uu.nl',
     url='https://github.com/UUDigitalHumanitieslab/sastadev',
     license='BSD-3-Clause',
     include_package_data=True,
     packages=['sastadev'],
-    package_data={'sastadev': ['*.txt', 'LICENSE', 'py.typed']}
+    package_data={
+        'sastadev': ['*.txt', 'LICENSE', 'py.typed']
+    }
 )
diff --git a/sastatypes.py b/sastatypes.py
@@ -2,8 +2,8 @@
 This module contains definitions of types used in multiple modules
 '''
 
-from typing import Dict, List, Any, Tuple, Callable, Pattern, Optional, NewType, Union
-from lxml import etree # type: ignore
+from typing import Dict, List, Tuple, Callable, Optional, Union
+from lxml import etree  # type: ignore
 from collections import Counter
 from query import Query
 from sastatoken import Token
@@ -18,7 +18,7 @@
 LocationName = str
 DCOIPt = str
 DeHet = str
-CELEX_INFL =str
+CELEX_INFL = str
 DCOITuple = Tuple
 Lemma = str
 CorrectionMode = str  # Literal['0','1','n']
@@ -72,6 +72,3 @@
 #CoreQueryFunction = Callable[[SynTree], List[SynTree]]
 #PostQueryFunction = Callable[[SynTree, allresults.AllResults], List[SynTree]]
 #QueryFunction = Union[CoreQueryFunction, PostQueryFunction]
-
-
-
diff --git a/stringfunctions.py b/stringfunctions.py
@@ -9,7 +9,7 @@
 tab = '\t'
 comma = ','
 
-csvre = "'[^']+'|[^,' ]+"  #for selecting nonempty tokens from a csvstring ; comma between single quotes is allowed
+csvre = "'[^']+'|[^,' ]+"  # for selecting nonempty tokens from a csvstring ; comma between single quotes is allowed
 csvpat = re.compile(csvre)
 
 wpat = r'^.*\w.*$'