From 2d5dba7e9c954f59d6fac0d55e9def79b116a129 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Thu, 14 Dec 2023 10:38:39 +0100
Subject: [PATCH 01/36] Add literal field to query

---
 .../migrations/0033_assessmentquery_literal.py | 18 ++++++++++++++++++
 backend/analysis/models.py                     |  1 +
 2 files changed, 19 insertions(+)
 create mode 100644 backend/analysis/migrations/0033_assessmentquery_literal.py

diff --git a/backend/analysis/migrations/0033_assessmentquery_literal.py b/backend/analysis/migrations/0033_assessmentquery_literal.py
new file mode 100644
index 00000000..336e1992
--- /dev/null
+++ b/backend/analysis/migrations/0033_assessmentquery_literal.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.14 on 2023-11-15 15:46
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('analysis', '0032_method_subj_adapt'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='assessmentquery',
+            name='literal',
+            field=models.CharField(blank=True, default='', max_length=200),
+        ),
+    ]

diff --git a/backend/analysis/models.py b/backend/analysis/models.py
index 1d4c06a6..117d469c 100644
--- a/backend/analysis/models.py
+++ b/backend/analysis/models.py
@@ -289,6 +289,7 @@ class AssessmentQuery(models.Model):
     inform = models.CharField(max_length=20, blank=True, default='')
     screening = models.CharField(max_length=20, blank=True, default=True)
     process = models.IntegerField(blank=True, null=True)
+    literal = models.CharField(max_length=200, blank=True, default='')
     stars = models.CharField(max_length=50, blank=True, default='')
     filter = models.CharField(max_length=200, blank=True, default='')
     variants = models.CharField(max_length=200, blank=True, default='')

From d9eaf397bdd6907909706ffbd89242e896596610 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 3 Jan 2024 14:17:09 +0100
Subject: [PATCH 02/36] fix safreader tests

---
 backend/analysis/annotations/safreader_test.py | 4 +++-
 backend/analysis/conftest.py                   | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/backend/analysis/annotations/safreader_test.py b/backend/analysis/annotations/safreader_test.py
index 105d96bf..7600cce3 100644
--- a/backend/analysis/annotations/safreader_test.py
+++ b/backend/analysis/annotations/safreader_test.py
@@ -23,7 +23,9 @@ def test_read_saf(method, transcript, filedir, samplenum):
     read_results = reader.document.to_allresults()
 
     # are the coreresults the same?
- assert sorted(read_results.coreresults.keys()) == sorted(true_results.coreresults.keys()) + sorted_read = sorted(read_results.coreresults.keys()) + sorted_true = sorted(true_results.coreresults.keys()) + assert sorted_read == sorted_true for q, hits in read_results.coreresults.items(): true_hits = true_results.coreresults[q] assert hits == true_hits diff --git a/backend/analysis/conftest.py b/backend/analysis/conftest.py index 82fb8669..93ba1a4a 100644 --- a/backend/analysis/conftest.py +++ b/backend/analysis/conftest.py @@ -65,7 +65,7 @@ def tarsp_method(db, tarsp_category, method_dir): @pytest.fixture def asta_method(db, asta_category, method_dir): - file = glob.glob(f'{method_dir}/ASTA_Index_Current.xlsx')[0] + file = glob.glob(f'{method_dir}/ASTA Index Current.xlsx')[0] with open(file, 'rb') as f: wrapped_file = File(f) instance = AssessmentMethod(name='asta_test_method', category=asta_category) From 8d7dd531a0f4b958c52cfff444ff00f73473f226 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 3 Jan 2024 14:58:17 +0100 Subject: [PATCH 03/36] Move some conftest up --- backend/analysis/conftest.py | 59 ----------------------- backend/conftest.py | 73 +++++++++++++++++++++++++++++ backend/convert/chat_reader_test.py | 3 -- backend/convert/chat_writer_test.py | 2 - 4 files changed, 73 insertions(+), 64 deletions(-) create mode 100644 backend/conftest.py diff --git a/backend/analysis/conftest.py b/backend/analysis/conftest.py index 93ba1a4a..174a9b67 100644 --- a/backend/analysis/conftest.py +++ b/backend/analysis/conftest.py @@ -15,65 +15,6 @@ CORRECTIONS_ASTA_16 = '{"Pause": [["15", "Pause", "[\'(..)\']", "CHAT", "None", "None", "uh (..) BEROEP1", "BEROEP1"], ["33", "Pause", "[\'(.)\']", "CHAT", "None", "None", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["33", "Pause", "[\'(.)\']", "CHAT", "None", "None", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["38", "Pause", "[\'(..)\']", "CHAT", "None", "None", "dat heb ik net nog gelezen (..)", null], ["45", "Pause", "[\'(..)\']", "CHAT", "None", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "Pause", "[\'(..)\']", "CHAT", "None", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["46", "Pause", "[\'(..)\']", "CHAT", "None", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["47", "Pause", "[\'(...)\']", "CHAT", "None", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "Pause", "[\'(.)\']", "CHAT", "None", "None", "en uh (...) uh (.) 
uh sorry", "en"]], "parsed_as": [["1", "parsed_as", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad", "SASTA", "Correction", "None", "ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad ", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad"], ["4", "parsed_as", "en nu krijg ik te horen", "SASTA", "Correction", "None", "en uh nu krijg ik te horen", "en nu krijg ik te horen"], ["6", "parsed_as", "en verder het gaat redelijk denk ik", "SASTA", "Correction", "None", "en verder ja het gaat redelijk denk ik", "en verder het gaat redelijk denk ik"], ["7", "parsed_as", "ik ben eerst naar een ziekenhuis geweest een aantal weken", "SASTA", "Correction", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["8", "parsed_as", "toen een aantal weken in een iets van zorg", "SASTA", "Correction", "None", "toen een aantal weken in een een iets van zorg ", "toen een aantal weken in een iets van zorg"], ["9", "parsed_as", "ik weet niet in uh( . )", "SASTA", "Correction", "None", "ik weet niet uh in uh(.)", "ik weet niet in uh( . )"], ["10", "parsed_as", "buiten is Breda", "SASTA", "Correction", "None", "uh buiten Breda ", "buiten is Breda"], ["13", "parsed_as", "en toen ik zo ver weer was ben ik naar hier gekomen", "SASTA", "Correction", "None", "en en toen ik zo ver weer was ben ik naar hier gekomen", "en toen ik zo ver weer was ben ik naar hier gekomen"], ["14", "parsed_as", "dat heet de ZORGINSTELLING1", "SASTA", "Correction", "None", "dat heet de uh ZORGINSTELLING1", "dat heet de ZORGINSTELLING1"], ["15", "parsed_as", "BEROEP1", "SASTA", "Correction", "None", "uh (..) 
BEROEP1", "BEROEP1"], ["16", "parsed_as", "is heel erg leuk", "SASTA", "Correction", "None", "ja is heel erg leuk ja", "is heel erg leuk"], ["20", "parsed_as", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn", "SASTA", "Correction", "None", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn ja", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn"], ["21", "parsed_as", "en ook ouders heel goed contact", "SASTA", "Correction", "None", "ja en ook ouders heel goed contact", "en ook ouders heel goed contact"], ["22", "parsed_as", "kinderen worden gebracht", "SASTA", "Correction", "None", "ki kinderen worden gebracht", "kinderen worden gebracht"], ["23", "parsed_as", "en dan is het contact goed met de ouders", "SASTA", "Correction", "None", "en uh dan is het contact goed met de ouders", "en dan is het contact goed met de ouders"], ["25", "parsed_as", "en kinderen zijn fijn lief", "SASTA", "Correction", "None", "en kinderen zijn ja fijn lief", "en kinderen zijn fijn lief"], ["26", "parsed_as", "en voelen zich wel gelukkig bij ons", "SASTA", "Correction", "None", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["28", "parsed_as", "ik werk drie dagen", "SASTA", "Correction", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["29", "parsed_as", "ik begin ik om half acht tot ik denk tot zes uur", "SASTA", "Correction", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["30", "parsed_as", "toevallig hierachter", "SASTA", "Correction", "None", "ja toevallig hierachter ", "toevallig hierachter"], ["31", "parsed_as", "kinderen met beperking", "SASTA", "Correction", "None", "uh kinderen met beperking", "kinderen met beperking"], ["33", "parsed_as", "ik weet het niet", "SASTA", "Correction", "None", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["35", "parsed_as", "ik herken het", "SASTA", "Correction", "None", "ik ik herken het", "ik herken het"], ["36", "parsed_as", "daar ben ik veel geweest", "SASTA", "Correction", "None", "wauw daar ben ik veel geweest", "daar ben ik veel geweest"], ["37", "parsed_as", "ook hier de", "SASTA", "Correction", "None", "ook hier de uh", "ook hier de"], ["40", "parsed_as", "veel geweest voor", "SASTA", "Correction", "None", "veel geweest voor uh", "veel geweest voor"], ["42", "parsed_as", "net gekoppeld", "SASTA", "Correction", "None", "ja uh net gekoppeld", "net gekoppeld"], ["44", "parsed_as", "dus maar ben ik veel geweest maar ook als ik hier", "SASTA", "Correction", "None", "dus maar ben ik veel geweest maar ook als ik hier uh", "dus maar ben ik veel geweest maar ook als ik hier"], ["45", "parsed_as", "ik zing met oudere mensen", "SASTA", "Correction", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["46", "parsed_as", "ik doe boekjes voor club geloof ik", "SASTA", "Correction", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["47", "parsed_as", "en", "SASTA", "Correction", "None", "en uh (...) uh (.) 
uh sorry", "en"]], "ExtraGrammatical": [["1", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad ", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad"], ["1", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad ", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad"], ["4", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh nu krijg ik te horen", "en nu krijg ik te horen"], ["6", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "en verder ja het gaat redelijk denk ik", "en verder het gaat redelijk denk ik"], ["7", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "Repeated ja, nee, nou", "SASTA", "Syntax", "Repetition", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["8", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "toen een aantal weken in een een iets van zorg ", "toen een aantal weken in een iets van zorg"], ["9", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik weet niet uh in uh(.)", "ik weet niet in uh( . )"], ["10", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "uh buiten Breda ", "buiten is Breda"], ["13", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "en en toen ik zo ver weer was ben ik naar hier gekomen", "en toen ik zo ver weer was ben ik naar hier gekomen"], ["14", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "dat heet de uh ZORGINSTELLING1", "dat heet de ZORGINSTELLING1"], ["15", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "uh (..) 
BEROEP1", "BEROEP1"], ["16", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja is heel erg leuk ja", "is heel erg leuk"], ["16", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja is heel erg leuk ja", "is heel erg leuk"], ["20", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn ja", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn"], ["21", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja en ook ouders heel goed contact", "en ook ouders heel goed contact"], ["22", "ExtraGrammatical", "Short Repetition", "SASTA", "Tokenisation", "Repetition", "ki kinderen worden gebracht", "kinderen worden gebracht"], ["23", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh dan is het contact goed met de ouders", "en dan is het contact goed met de ouders"], ["25", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "en kinderen zijn ja fijn lief", "en kinderen zijn fijn lief"], ["26", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["26", "ExtraGrammatical", "Word token of a repeated word token sequence", "SASTA", "Tokenisation", "Repetition", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["26", "ExtraGrammatical", "Word token of a repeated word token sequence", "SASTA", "Tokenisation", "Repetition", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["26", "ExtraGrammatical", "Word token of a repeated word token sequence", "SASTA", "Tokenisation", "Repetition", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["28", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["28", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["28", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["29", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["29", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["29", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["30", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja toevallig hierachter ", "toevallig hierachter"], ["31", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "uh kinderen met beperking", "kinderen met beperking"], ["33", "ExtraGrammatical", "Repeated ja, nee, nou", "SASTA", "Syntax", "Repetition", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["33", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "(.) ja (.) 
ja ik weet het niet", "ik weet het niet"], ["35", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "ik ik herken het", "ik herken het"], ["36", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "wauw daar ben ik veel geweest", "daar ben ik veel geweest"], ["37", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ook hier de uh", "ook hier de"], ["40", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "veel geweest voor uh", "veel geweest voor"], ["42", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ja uh net gekoppeld", "net gekoppeld"], ["42", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja uh net gekoppeld", "net gekoppeld"], ["44", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "dus maar ben ik veel geweest maar ook als ik hier uh", "dus maar ben ik veel geweest maar ook als ik hier"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["47", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "en uh (...) uh (.) 
uh sorry", "en"]]}' -@pytest.fixture -def cha_testfiles_dir(): - return op.join(settings.BASE_DIR, 'test_files') - - -@pytest.fixture -def tarsp_category(db): - obj = MethodCategory.objects.create(name='TARSP', zc_embeddings=True, levels=['Sz', 'Zc', 'Wg', 'VVW'], marking_postcodes=['[+ G]']) - yield obj - obj.delete() - - -@pytest.fixture -def stap_category(db): - obj = MethodCategory.objects.create(name='STAP', zc_embeddings=False, levels=['Complexiteit', 'Grammaticale fout'], marking_postcodes=['[+ G]', '[+ VU]']) - yield obj - obj.delete() - - -@pytest.fixture -def asta_category(db): - obj = MethodCategory.objects.create(name='ASTA', zc_embeddings=False, levels=[ - "Samplegrootte", - "MLU", - "Taalmaat", - "Foutenanalyse", - "Lemma" - ], marking_postcodes=["[+ G]"]) - yield obj - obj.delete() - - -@pytest.fixture -def method_dir(): - return op.join(sd_settings.SD_DIR, 'data', 'methods') - - -@pytest.fixture -def tarsp_method(db, tarsp_category, method_dir): - file = glob.glob(f'{method_dir}/TARSP Index Current.xlsx')[0] - with open(file, 'rb') as f: - wrapped_file = File(f) - instance = AssessmentMethod(name='tarsp_test_method', category=tarsp_category) - instance.content.save(op.basename(file), wrapped_file) - yield instance - instance.delete() - - -@pytest.fixture -def asta_method(db, asta_category, method_dir): - file = glob.glob(f'{method_dir}/ASTA Index Current.xlsx')[0] - with open(file, 'rb') as f: - wrapped_file = File(f) - instance = AssessmentMethod(name='asta_test_method', category=asta_category) - instance.content.save(op.basename(file), wrapped_file) - yield instance - instance.delete() - - @pytest.fixture def tarsp_corpus(db, admin_user, tarsp_method, tarsp_category): obj = Corpus.objects.create( diff --git a/backend/conftest.py b/backend/conftest.py new file mode 100644 index 00000000..075d0622 --- /dev/null +++ b/backend/conftest.py @@ -0,0 +1,73 @@ +import pytest +from django.conf import settings +from analysis.models import MethodCategory, AssessmentMethod +from os import path as op +from sastadev.conf import settings as sd_settings +import glob +from django.core.files import File + + +@pytest.fixture +def cha_testfiles_dir(): + return op.join(settings.BASE_DIR, 'test_files') + + +@pytest.fixture +def tarsp_category(db): + obj = MethodCategory.objects.create( + name='TARSP', zc_embeddings=True, levels=[ + 'Sz', 'Zc', 'Wg', 'VVW'], marking_postcodes=['[+ G]']) + yield obj + obj.delete() + + +@pytest.fixture +def stap_category(db): + obj = MethodCategory.objects.create( + name='STAP', zc_embeddings=False, levels=[ + 'Complexiteit', 'Grammaticale fout'], marking_postcodes=['[+ G]', '[+ VU]']) + yield obj + obj.delete() + + +@pytest.fixture +def asta_category(db): + obj = MethodCategory.objects.create( + name='ASTA', zc_embeddings=False, levels=[ + "Samplegrootte", + "MLU", + "Taalmaat", + "Foutenanalyse", + "Lemma" + ], marking_postcodes=["[+ G]"]) + yield obj + obj.delete() + + +@pytest.fixture +def method_dir(): + return op.join(sd_settings.SD_DIR, 'data', 'methods') + + +@pytest.fixture +def tarsp_method(db, tarsp_category, method_dir): + file = glob.glob(f'{method_dir}/TARSP Index Current.xlsx')[0] + with open(file, 'rb') as f: + wrapped_file = File(f) + instance = AssessmentMethod( + name='tarsp_test_method', category=tarsp_category) + instance.content.save(op.basename(file), wrapped_file) + yield instance + instance.delete() + + +@pytest.fixture +def asta_method(db, asta_category, method_dir): + file = glob.glob(f'{method_dir}/ASTA Index Current.xlsx')[0] + with 
open(file, 'rb') as f: + wrapped_file = File(f) + instance = AssessmentMethod( + name='asta_test_method', category=asta_category) + instance.content.save(op.basename(file), wrapped_file) + yield instance + instance.delete() diff --git a/backend/convert/chat_reader_test.py b/backend/convert/chat_reader_test.py index d1f9c851..36a8fbda 100644 --- a/backend/convert/chat_reader_test.py +++ b/backend/convert/chat_reader_test.py @@ -1,14 +1,12 @@ import os.path as op import pytest -from analysis.conftest import stap_category, tarsp_category from .chat_reader import ChatDocument from .conftest import TEST_DIR @pytest.mark.django_db -@pytest.mark.usefixtures("tarsp_category") def test_chat_reader(chafiles, tarsp_category): for (input, _) in chafiles: doc = ChatDocument.from_chatfile(input, tarsp_category) @@ -16,7 +14,6 @@ def test_chat_reader(chafiles, tarsp_category): @pytest.mark.django_db -@pytest.mark.usefixtures("tarsp_category", "stap_category") def test_marking_postcodes(chafiles, tarsp_category, stap_category): inf = op.join(TEST_DIR, 'TD16.cha') diff --git a/backend/convert/chat_writer_test.py b/backend/convert/chat_writer_test.py index 997453a0..9c7532e2 100644 --- a/backend/convert/chat_writer_test.py +++ b/backend/convert/chat_writer_test.py @@ -1,12 +1,10 @@ import pytest -from analysis.conftest import tarsp_category from .chat_reader import ChatDocument from .chat_writer import ChatWriter @pytest.mark.django_db -@pytest.mark.usefixtures("tarsp_category") def test_chat_writer(chafiles, tarsp_category): for inpath, outpath in chafiles: doc = ChatDocument.from_chatfile(inpath, tarsp_category) From b7a99d9e2a29283b27604d27e419d4c53d8578ca Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 17 Jan 2024 16:53:12 +0100 Subject: [PATCH 04/36] First version of new SAF writer --- backend/analysis/models.py | 39 +++-- backend/analysis/views.py | 13 +- backend/annotations/__init__.py | 0 backend/annotations/apps.py | 6 + backend/annotations/conftest.py | 11 ++ backend/annotations/constants.py | 8 + backend/annotations/utils.py | 56 +++++++ backend/annotations/writer.py | 160 ++++++++++++++++++++ backend/annotations/writer_tests.py | 50 ++++++ backend/conftest.py | 45 +++++- backend/sasta/common_settings.py | 2 +- backend/setup.cfg | 2 +- backend/test_files/single_utt.cha | 11 ++ backend/test_files/single_utt.xml | 133 ++++++++++++++++ backend/test_files/single_utt_corrected.xml | 129 ++++++++++++++++ backend/test_files/single_utt_saf.xlsx | Bin 0 -> 28405 bytes 16 files changed, 641 insertions(+), 24 deletions(-) create mode 100644 backend/annotations/__init__.py create mode 100644 backend/annotations/apps.py create mode 100644 backend/annotations/conftest.py create mode 100644 backend/annotations/constants.py create mode 100644 backend/annotations/utils.py create mode 100644 backend/annotations/writer.py create mode 100644 backend/annotations/writer_tests.py create mode 100644 backend/test_files/single_utt.cha create mode 100644 backend/test_files/single_utt.xml create mode 100644 backend/test_files/single_utt_corrected.xml create mode 100644 backend/test_files/single_utt_saf.xlsx diff --git a/backend/analysis/models.py b/backend/analysis/models.py index 117d469c..81616b38 100644 --- a/backend/analysis/models.py +++ b/backend/analysis/models.py @@ -4,7 +4,7 @@ import zipfile from io import BytesIO from itertools import chain -from typing import List, Optional, Tuple +from typing import Dict, List, Tuple from uuid import uuid4 from analysis.annotations.utils import clean_item @@ -14,7 
+14,9 @@ from django.db import models from lxml import etree as ET from sastadev.external_functions import form_map +from sastadev.methods import Method from sastadev.query import Query +from sastadev.readmethod import read_method logger = logging.getLogger('sasta') @@ -68,7 +70,8 @@ def upload_path(self, filename): content = models.FileField( upload_to=upload_path, blank=True, null=True, max_length=500) category = models.ForeignKey( - MethodCategory, related_name='definitions', blank=True, null=True, on_delete=models.CASCADE) + MethodCategory, related_name='definitions', + blank=True, null=True, on_delete=models.CASCADE) def __str__(self): return self.name @@ -84,6 +87,11 @@ def get_item_mapping(self, sep): mapping.update(q.get_item_mapping(sep)) return mapping + def to_sastadev(self) -> Method: + cat_name = self.category.name.lower() + location = self.content.path + return read_method(cat_name, location) + class Corpus(models.Model): user = models.ForeignKey( @@ -94,9 +102,12 @@ class Corpus(models.Model): date_added = models.DateField(auto_now_add=True) date_modified = models.DateField(auto_now=True) default_method = models.ForeignKey(AssessmentMethod, - on_delete=models.SET_NULL, related_name='corpora', blank=True, null=True) + on_delete=models.SET_NULL, + related_name='corpora', + blank=True, null=True) method_category = models.ForeignKey( - MethodCategory, on_delete=models.SET_DEFAULT, default=1, related_name='corpora') + MethodCategory, on_delete=models.SET_DEFAULT, + default=1, related_name='corpora') def __str__(self): return self.name @@ -176,9 +187,11 @@ def get_utterance_by_id(self, utt_id: int): except Exception: raise - def get_filepaths(self) -> Tuple[str]: + def get_filepaths(self) -> Tuple: if self.corrected_content: - return (self.content.path, self.parsed_content.path, self.corrected_content.path) + return (self.content.path, + self.parsed_content.path, + self.corrected_content.path) return (self.content.path, self.parsed_content.path) @property @@ -229,15 +242,17 @@ def for_analysis(self): def word_elements(self) -> List[ET._Element]: '''List of word elements, sorted by word (begin, end)''' word_elements = self.syntree.findall('.//node[@word]') - return sorted(word_elements, key=lambda x: (int(x.attrib.get('begin')), int(x.attrib.get('end')))) + return sorted(word_elements, key=lambda x: (int(x.attrib.get('begin')), + int(x.attrib.get('end')))) @property @functools.lru_cache(maxsize=128) - def word_position_mapping(self) -> List[Tuple[Optional[int], Optional[int]]]: + def word_position_mapping(self) -> List[Dict]: ''' List of dictionaries (begin, end) for each word in the utterance starts with { begin:None, end:None } to represent unaligned ''' - mapping = [{'begin': int(el.attrib.get('begin')), 'end': int(el.attrib.get('end'))} + mapping = [{'begin': int(el.attrib.get('begin')), + 'end': int(el.attrib.get('end'))} for el in self.word_elements] return [{'begin': None, 'end': None}] + mapping @@ -354,12 +369,14 @@ def upload_path(self, filename): transcript = models.ForeignKey( Transcript, related_name='analysisruns', on_delete=models.CASCADE) method = models.ForeignKey( - AssessmentMethod, related_name='analysisruns', on_delete=models.CASCADE) + AssessmentMethod, related_name='analysisruns', + on_delete=models.CASCADE) created = models.DateTimeField(auto_now_add=True) query_file = models.FileField(upload_to=upload_path, max_length=500) annotation_file = models.FileField(upload_to=upload_path, max_length=500) is_manual_correction = models.BooleanField( - default=False, 
help_text='this run was generated by parsing a user-uploaded SAF-file') + default=False, + help_text='this run was generated by parsing a user-uploaded SAF-file') class Meta: get_latest_by = "created" diff --git a/backend/analysis/views.py b/backend/analysis/views.py index aaee2b0a..42b344e2 100644 --- a/backend/analysis/views.py +++ b/backend/analysis/views.py @@ -9,6 +9,7 @@ from analysis.annotations.safreader import SAFReader from analysis.query.run import query_transcript from analysis.query.xlsx_output import annotations_to_xlsx, querycounts_to_xlsx +from annotations.writer import SAFWriter from celery import group from convert.chat_writer import ChatWriter from django.db.models import Q @@ -25,8 +26,9 @@ from .models import (AnalysisRun, AssessmentMethod, Corpus, MethodCategory, Transcript, UploadFile) from .permissions import IsCorpusChildOwner, IsCorpusOwner -from .serializers import (AssessmentMethodSerializer, CorpusDetailsSerializer, CorpusListSerializer, - MethodCategorySerializer, TranscriptDetailsSerializer, +from .serializers import (AssessmentMethodSerializer, CorpusDetailsSerializer, + CorpusListSerializer, MethodCategorySerializer, + TranscriptDetailsSerializer, TranscriptListSerializer, UploadFileSerializer) from .utils import StreamFile @@ -104,7 +106,12 @@ def annotate(self, request, *args, **kwargs): transcript, method, True, zc_embed ) - spreadsheet = annotations_to_xlsx(allresults, method) + # spreadsheet = annotations_to_xlsx(allresults, method) + writer = SAFWriter(method.to_sastadev(), allresults) + writer.make_workbook() + spreadsheet = writer.workbook + + self.create_analysis_run(transcript, method, spreadsheet) format = request.data.get('format', 'xlsx') diff --git a/backend/annotations/__init__.py b/backend/annotations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/annotations/apps.py b/backend/annotations/apps.py new file mode 100644 index 00000000..ecaea75e --- /dev/null +++ b/backend/annotations/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class AnnotationsConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'annotations' diff --git a/backend/annotations/conftest.py b/backend/annotations/conftest.py new file mode 100644 index 00000000..66638b95 --- /dev/null +++ b/backend/annotations/conftest.py @@ -0,0 +1,11 @@ + +import pytest + +from annotations.writer import SAFWriter + + +@pytest.fixture +def safwriter(asta_method, single_utt_allresults): + sd_method = asta_method.to_sastadev() + writer = SAFWriter(sd_method, single_utt_allresults) + return writer diff --git a/backend/annotations/constants.py b/backend/annotations/constants.py new file mode 100644 index 00000000..e2538e60 --- /dev/null +++ b/backend/annotations/constants.py @@ -0,0 +1,8 @@ +SAF_COMMENT_LEVEL = 'Commentaar' +SAF_COMMENT_COLUMN = 'Commentaar' +SAF_UNALIGNED_LEVEL = 'Hele uiting' +SAF_UTT_LEVEL = 'Utt' +PRE_WORDS_HEADERS = ['ID', 'Level', SAF_UNALIGNED_LEVEL] +POST_WORDS_HEADERS = ['Fases', SAF_COMMENT_LEVEL] +PRIMARY_COLOR = '3f51b5' +SECONDARY_COLOR = 'b5a33f' diff --git a/backend/annotations/utils.py b/backend/annotations/utils.py new file mode 100644 index 00000000..1195f571 --- /dev/null +++ b/backend/annotations/utils.py @@ -0,0 +1,56 @@ +from typing import Any, List + +from annotations.constants import PRIMARY_COLOR, SAF_UTT_LEVEL +from openpyxl.styles import Font, PatternFill +from openpyxl.styles.protection import Protection +from openpyxl.utils import get_column_letter +from 
openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder
+from sastadev.allresults import AllResults
+
+
+def get_max_words(allresults: AllResults) -> int:
+    '''Get the length of the longest utterance in the results'''
+    return max(len(v) for v in allresults.allutts.values())
+
+
+def ljust(li: List[Any], n: int, fillvalue=None) -> List[Any]:
+    '''Pad the list with fillvalues up to N'''
+    return li + [fillvalue] * (n - len(li))
+
+
+def format_worksheet(worksheet) -> None:
+    '''Locks all cells except annotation fields.
+    Gives utterance rows a primary background.
+    '''
+
+    # start by locking the entire sheet
+    worksheet.protection.sheet = True
+    unlocked = Protection(locked=False)
+
+    header = worksheet["1:1"]
+    for cell in header:
+        # bold headers
+        cell.font = Font(bold=True)
+
+    # primary-colored background for each utterance row
+    for row in list(worksheet.rows)[1:]:
+        if row[1].value == SAF_UTT_LEVEL:
+            for cell in row:
+                cell.font = Font(color='FFFFFF')
+                cell.fill = PatternFill(
+                    start_color=PRIMARY_COLOR,
+                    end_color=PRIMARY_COLOR,
+                    fill_type="solid")
+        else:
+            # unlock non-utterance rows
+            # skip the first two columns (utt number and level)
+            for cell in row[2:]:
+                cell.protection = unlocked
+
+
+def autosize_columns(worksheet) -> None:
+    dim_holder = DimensionHolder(worksheet=worksheet)
+    for col in range(worksheet.min_column, worksheet.max_column + 1):
+        dim_holder[get_column_letter(col)] = ColumnDimension(
+            worksheet, min=col, max=col, auto_size=True)
+    worksheet.column_dimensions = dim_holder
diff --git a/backend/annotations/writer.py b/backend/annotations/writer.py
new file mode 100644
index 00000000..4bed732f
--- /dev/null
+++ b/backend/annotations/writer.py
@@ -0,0 +1,160 @@
+import itertools
+from dataclasses import dataclass, field
+from typing import Dict, List, TextIO, Tuple
+
+from analysis.models import MethodCategory
+from openpyxl import Workbook
+from openpyxl.cell.cell import Cell
+from openpyxl.worksheet.worksheet import Worksheet
+from sastadev.allresults import AllResults, ResultsKey
+from sastadev.methods import Method
+from sastadev.sastatypes import ExactResults
+
+from .constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS,
+                        SAF_COMMENT_LEVEL, SAF_UTT_LEVEL)
+from .utils import autosize_columns, format_worksheet, get_max_words, ljust
+
+
+@dataclass
+class SAFWriter():
+    method: Method
+    results: AllResults
+    workbook: Workbook = field(init=False, default_factory=Workbook)
+    anno_ws: Worksheet = field(init=False)
+    method_category: MethodCategory = field(init=False)
+    # Number of non-word columns, counted from 0
+    word_offset: int = field(default=len(PRE_WORDS_HEADERS), init=False)
+    # Number of words of the longest utterance in the results
+    max_words: int = field(init=False)
+    # Offset for row (1 - len(levels)) below utt row (0)
+    level_offsets: Dict[str, int] = field(init=False)
+    # Number of rows each utterance takes up (utt + level rows)
+    utt_n_rows: int = field(init=False)
+    # header row
+    anno_headers: List[str] = field(init=False)
+
+    def __post_init__(self) -> None:
+        self.max_words = get_max_words(self.results)
+        self.method_category = MethodCategory.objects.get(
+            name=self.method.name.upper())
+
+        all_levels = [SAF_UTT_LEVEL,
+                      *self.method_category.levels,
+                      SAF_COMMENT_LEVEL]
+        self.level_offsets = {
+            level.lower(): index
+            for (index, level)
+            in enumerate(all_levels)
+        }
+        self.utt_n_rows = len(all_levels)
+        self.anno_headers = self._annotations_header_row()
+
+    def write(self, target: TextIO) -> None:
+        '''Write the completed output file'''
+        self.workbook.save(target)
+
+    def make_workbook(self) -> None:
+        '''Create the complete workbook.
+        Any additional required sheets should be created in this method.
+        '''
+        _ = self._make_annotations_worksheet()
+        format_worksheet(self.anno_ws)
+        autosize_columns(self.anno_ws)
+        errors_ws = self.workbook.create_sheet('error', 1)
+        errors_ws.cell(1, 1).value = 'Hier komen errors'
+
+    def _make_annotations_worksheet(self) -> Worksheet:
+        '''Transform results into a SAF-formatted worksheet'''
+        self.anno_ws = self.workbook.create_sheet(title='annotations', index=0)
+        # Headers
+        self.anno_ws.append(self.anno_headers)
+
+        # Rest
+        self._make_levels_rows(self.anno_ws)
+
+        # Fill with values
+        for qid, qresults in self.results.exactresults.items():
+            self._fill_query(qid, qresults)
+        return self.anno_ws
+
+    def _annotations_header_row(self) -> List[str]:
+        '''Create header row with correct number of word columns'''
+        word_headers = [f'Word{i}' for i in range(1, self.max_words + 1)]
+
+        return list(itertools.chain(
+            PRE_WORDS_HEADERS,
+            word_headers,
+            POST_WORDS_HEADERS
+        ))
+
+    def _uttlevel_row(self, id: int, words: List[str]) -> List[str]:
+        '''Create utterance level row'''
+        pre_word_values = [id, SAF_UTT_LEVEL, None]  # Unaligned
+        word_values = ljust(words, self.max_words)
+        post_word_values = [None, None]  # Fases, Commentaar
+        return list(itertools.chain(
+            pre_word_values,
+            word_values,
+            post_word_values
+        ))
+
+    def _make_levels_rows(self, ws: Worksheet) -> None:
+        '''Create rows for all utterances, all levels'''
+        row_size = len(self.anno_headers)
+        all_levels = self.method_category.levels + [SAF_COMMENT_LEVEL]
+
+        for utt_id, words in sorted(self.results.allutts.items(),
+                                    key=lambda x: x[0]):
+            ws.append(self._uttlevel_row(utt_id, words))
+            for level in all_levels:
+                level_row = ljust([utt_id, level], row_size)
+                ws.append(level_row)
+
+    def _fill_query(self, query_id: ResultsKey, exact_results: ExactResults):
+        '''Find and fill all cells for a single query'''
+        query = self.method.queries.get(query_id)
+        item = query.item
+        fase = query.fase
+
+        for utt_id, word_nr in exact_results:
+            # We cannot assume that utterances are numbered 1-N sequentially
+            utt_nr = list(self.results.allutts.keys()).index(utt_id)
+            row, col = self._cell_location(utt_nr, query.level, word_nr)
+            cell = self.anno_ws.cell(row, col)
+            self._append_item(cell, item)
+            if fase:
+                self._append_fase(row, str(fase))
+
+    def _cell_location(self, utt_nr: int, level: str,
+                       word_nr: int) -> Tuple[int, int]:
+        '''Find the coordinates of a cell'''
+        return (
+            self._uttlevel_row_number(utt_nr, level),
+            self._word_col_number(word_nr)
+        )
+
+    def _uttlevel_row_number(self, utt_nr: int, level: str) -> int:
+        '''Calculate the row number for level of utterance (1 indexed)'''
+        total = 1  # header and 1-indexed offsets
+        utt_offset = (utt_nr * self.utt_n_rows) + 1
+        level_offset = self.level_offsets.get(level.lower(), 0)
+        total += utt_offset + level_offset
+        return total
+
+    def _word_col_number(self, word_nr: int) -> int:
+        '''Calculate the column number for a word'''
+        return word_nr + len(PRE_WORDS_HEADERS)
+
+    def _append_item(self, cell: Cell, item: str) -> None:
+        cell.value = item if not cell.value else f'{cell.value}, {item}'
+
+    def _append_fase(self, row: int, fase: str) -> None:
+        fase_cell = self.anno_ws.cell(row, len(self.anno_headers) - 1)
+        sep = ', '
+        if not fase_cell.value:
+            fase_cell.value = fase
+        else:
+            current = set(fase_cell.value.split(sep))
+            current.add(fase)
+            new = sep.join(sorted(list(current)))
+            fase_cell.value = new
diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py
new file mode 100644
index 00000000..0f3cc87f
--- /dev/null
+++ b/backend/annotations/writer_tests.py
@@ -0,0 +1,50 @@
+from annotations.writer import SAFWriter
+
+from .utils import ljust
+
+
+def test_safwriter(safwriter: SAFWriter, tmp_path):
+    safwriter.make_workbook()
+    safwriter.write(tmp_path / 'saf_writer_test.xlsx')
+    assert safwriter
+
+
+def test_headers(safwriter: SAFWriter):
+    found = safwriter._annotations_header_row()
+    expected = ['ID', 'Level', 'Hele uiting',
+                'Word1', 'Word2', 'Word3', 'Word4',
+                'Word5', 'Word6', 'Word7', 'Word8',
+                'Word9', 'Word10', 'Word11', 'Word12',
+                'Word13', 'Word14', 'Word15', 'Word16',
+                'Word17', 'Word18',
+                'Fases', 'Commentaar']
+    assert found == expected
+
+
+def test_uttlevel_row(safwriter: SAFWriter):
+    id = 1
+    words = safwriter.results.allutts[id]
+    found = safwriter._uttlevel_row(id, words)
+    expected = [1, 'Utt', None, 'ja', 'uh', 'ik', 'vind', 'het', 'beetje',
+                'moeilijk', 'om', 'het', 'goed', 'te', 'vertellen', 'want',
+                'ik', 'heb', 'een', 'ongeluk', 'gehad', None, None]
+    assert found == expected
+
+
+def test_ljust_list():
+    input = ['a', 'b', 'c']
+    ljustified = ljust(input, 5)
+    assert ljustified == input + [None, None]
+    ljustified = ljust(input, 3)
+    assert ljustified == input
+    ljustified = ljust(input, 2)
+    assert ljustified == input
+
+
+def test_uttlevel_offset(safwriter: SAFWriter):
+    assert safwriter._uttlevel_row_number(0, 'Samplegrootte') == 3
+    assert safwriter._uttlevel_row_number(0, 'Taalmaat') == 5
+    assert safwriter._uttlevel_row_number(0, 'Commentaar') == 8
+    assert safwriter._uttlevel_row_number(2, 'Samplegrootte') == 17
+    assert safwriter._uttlevel_row_number(2, 'Taalmaat') == 19
+    assert safwriter._uttlevel_row_number(2, 'Commentaar') == 22
diff --git a/backend/conftest.py b/backend/conftest.py
index 075d0622..966f58d2 100644
--- a/backend/conftest.py
+++ b/backend/conftest.py
@@ -1,10 +1,13 @@
+import glob
+from collections import Counter
+from os import path as op
+
 import pytest
+from analysis.models import AssessmentMethod, MethodCategory
 from django.conf import settings
-from analysis.models import MethodCategory, AssessmentMethod
-from os import path as op
-from sastadev.conf import settings as sd_settings
-import glob
 from django.core.files import File
+from sastadev.allresults import AllResults
+from sastadev.conf import settings as sd_settings
 
 
 @pytest.fixture
@@ -15,8 +18,9 @@ def cha_testfiles_dir():
 @pytest.fixture
 def tarsp_category(db):
     obj = MethodCategory.objects.create(
-        name='TARSP', zc_embeddings=True, levels=[
-            'Sz', 'Zc', 'Wg', 'VVW'], marking_postcodes=['[+ G]'])
+        name='TARSP', zc_embeddings=True,
+        levels=['Sz', 'Zc', 'Wg', 'VVW'],
+        marking_postcodes=['[+ G]'])
     yield obj
     obj.delete()
 
@@ -24,8 +28,9 @@ def tarsp_category(db):
 @pytest.fixture
 def stap_category(db):
     obj = MethodCategory.objects.create(
-        name='STAP', zc_embeddings=False, levels=[
-            'Complexiteit', 'Grammaticale fout'], marking_postcodes=['[+ G]', '[+ VU]'])
+        name='STAP', zc_embeddings=False,
+        levels=['Complexiteit', 'Grammaticale fout'],
+        marking_postcodes=['[+ G]', '[+ VU]'])
     yield obj
     obj.delete()
 
@@ -71,3 +76,27 @@ def asta_method(db, asta_category, method_dir):
     instance.content.save(op.basename(file), wrapped_file)
     yield instance
     instance.delete()
+
+
+@pytest.fixture
+def single_utt_allresults():
+    return AllResults(
+        uttcount=1,
coreresults={'A029': Counter({1: 1}), 'A045': Counter({1: 1}), + 'A001': Counter({1: 1}), 'A003': Counter({1: 2}), + 'A013': Counter({1: 1}), 'A018': Counter({1: 2}), + 'A021': Counter({1: 2}), 'A024': Counter({1: 2})}, + exactresults={'A029': [(1, 1)], 'A045': [(1, 2)], 'A001': [(1, 7)], + 'A003': [(1, 8), (1, 13)], 'A013': [(1, 4)], + 'A018': [(1, 12), (1, 18)], 'A021': [(1, 6), (1, 17)], + 'A024': [(1, 4), (1, 15)]}, + postresults={'A046': Counter(), 'A049': Counter()}, + allmatches=None, # Not provided in this fixture + filename='single_utt', + analysedtrees=[(1, None)], + annotationinput=False, + allutts={1: ['ja', 'uh', 'ik', 'vind', 'het', 'beetje', 'moeilijk', + 'om', 'het', 'goed', 'te', 'vertellen', 'want', 'ik', + 'heb', 'een', 'ongeluk', 'gehad']} + + ) diff --git a/backend/sasta/common_settings.py b/backend/sasta/common_settings.py index bc42acbc..134b191a 100644 --- a/backend/sasta/common_settings.py +++ b/backend/sasta/common_settings.py @@ -25,7 +25,7 @@ 'authentication', 'parse', 'convert', - 'sastadev' + 'annotations' ] MIDDLEWARE = [ diff --git a/backend/setup.cfg b/backend/setup.cfg index 6b8a7c62..344cc0c5 100644 --- a/backend/setup.cfg +++ b/backend/setup.cfg @@ -11,7 +11,7 @@ ignore_missing_imports = True exclude = sastadev .env -ignore = E501, W503 +ignore = W503 max-complexity = 10 # pytest fixtures need this funky import stye, hence the ignore F401 and F811 per-file-ignores = diff --git a/backend/test_files/single_utt.cha b/backend/test_files/single_utt.cha new file mode 100644 index 00000000..774e30c9 --- /dev/null +++ b/backend/test_files/single_utt.cha @@ -0,0 +1,11 @@ +@UTF8 +@Begin +@Languages: nld +@Participants: PMA pma Other, INV inv Other +@ID: nld||PMA|||||Other||| +@ID: nld||INV|||||Other||| +@Comment: ##META text samplenaam = ASTA-16 + +*INV: Kun u mij eens wat vertellen waarom u hier bent wat is er gebeurd +*PMA: ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad +%xsid: 1 diff --git a/backend/test_files/single_utt.xml b/backend/test_files/single_utt.xml new file mode 100644 index 00000000..5c98c69b --- /dev/null +++ b/backend/test_files/single_utt.xml @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Kun u mij eens wat vertellen waarom u hier bent wat is er gebeurd + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/backend/test_files/single_utt_corrected.xml b/backend/test_files/single_utt_corrected.xml new file mode 100644 index 00000000..a10d7eb2 --- /dev/null +++ b/backend/test_files/single_utt_corrected.xml @@ -0,0 +1,129 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Kun u mij eens wat vertellen waarom u hier bent wat is er gebeurd + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad + Q#ng1704292912|ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad|1|1|-12.662841177789993 + + \ No newline at end of file diff --git a/backend/test_files/single_utt_saf.xlsx b/backend/test_files/single_utt_saf.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..9046b16bf644fb480f9902bd1f51b28dc042459d GIT binary patch literal 28405 zcmeIb2|SeR`#)~UmTcMgEn5;v6xp{KJE4@NWGzd!>?RZyGEpR+{j8$1-zY&vjq-`~AMI>wf0-JON~`jK@tn-F 
z-OYXelCy6e@?4x>gj}sHHQE&sT>r|%l3hl;rfy20f|)hO{iGT4J_Ebb0ah!kD7!?0 zJ8p$lMp9}Ym9=u%92l7tE`3VssWYWR)E4i`I+<~#zPIv&p6LMH;l|81+E1tbav3gN zDYm@K>!_@gwb1jT6#?b}eigzuB2WhC5@|*UE3s3_O2ND#@b?4;=OYT39!t732UaUG zrs_0(e$CUrD)5n>qZ+|(Q<5AFwb5hYbKdviQC+f{{t=nIAwlW&4Mm4v=}({K2yjVj zB^%nIzKms9q#G}pf2Bg(@QQa&VEFWh`f)k`sg~hWp)<2_;}FSpQUU@jmYBfcJ6b+B zk>E!GSknPmM+wl<%H7tYc^oL{DiPW<=FJ&u&0GG_a|bT6Cqu3Y2#FiSC8mS!4nwkJ*@4!p9Iu9@h+^e znH+OC;`r1$*h~ox93EK~7GMoeP2*ET6~n`%&b^cHl04_p!r2I5rm0})68iG7smCD&g@rtKMeK&L z#j3rFq$wf3+^zGh6yT#&z67mf?YyGPmFkP+b1&EIbba5QuS`Yn8s^B>=E^!G9}wor z6JD>$4H1d)7=D;gYJ&Xesj}KAQqeoNuL*^;8)~TQKmaTKN66Il=BVEQ;6enbE)DPw z05Y4w0yDO7?U7&zOew&gUXl0W;CW=ITC>vu)O)$}d0&#B4fLrS7Wz!^09R!SCGBR0F zl?N4@pU=O0zD$MSzSDJ&(X7j~O&*4l{%N8Qc=B4(Y^Yz@ft_r3G&$|REK0fi${8Ik zLB;{zJtOige$Sxhky5$Z5BjL9uZA4C@9JUJ&GdK(87!J$)?xC>%Z#wvoC)sCZV;)J ztP`)rE-kLK7`E6_`--zlRh8x8;vVuO4ihbxf(}Y=(iR&{w3;Fn#Uo9T_xDCV1cEHA zt5Z>jjy{!~B{Qob%ULFg3`@~SJ^i>=lwuDu)S-;Hkep^ea~`d?+$5i=NyUkAQIpfg z5|gzFo_yM~N1PT;a+FDUEj*4RQ4@)}#LUVub@brV;2s)Ep-5{z^Yg>6&iPNIJU0mq zOgb-d04=uO2`68^_)3L`Orw@wy71tLyKy34%}{5E!fRQ>LPUgh-mM!aXzS|l&E7U( zb-O=RDfBw#c$iK443q3*xm&t)E$|?;QSkw_`t| z>%D4v5haN{1;!>(TBQmW*G(CfegQ%f6m+oQnZ5XX?|eCt&czz4pa|(DBU`_I;hq+&gB>dRNEDtJs2$dxr&Z`?z&#v!C=#a z{2j|{wM&L$Z-dUwMcf3h(b?T?O$(Z?I4rvShDULQTl%7q^D5UG@t;BWJR@tJ8y7hxZH^mq1Gpg%-Cn#FtoP@;o{*MVtDrI zK8oWpX+Em#0dawAP$|2r@wc3hNC_`H83owu$j@Waic&+QgtF3hmm6zQTA$859(3%y zR%5Np?!M@d(^HhM_sr1RJv>aRMjKf1q)g@7U6vO%_Y$J2UV|9&n{FH5-FL*mxr$E2 zfc=Ko<&$Od)n?pD{3jlmr0(?~Dk{&~YuZxhnZDjGRKrC9MT9Zb2GMd>4@}7LMFr?` z-x1?}7o!DkyRY*25!h{qn)6P(B_j;97_)8Yg&g3`xO^9b`hO_DV&=oc=%BA+MHIV}sHE~^$$`xD5KHs*$_J3p_2 zxOkJVB?~_9jRINBN8R=lALPHIl(>s{jh#~OC>N#CGEwJq(=evAQb%6h1(&Fzy)&8F z!MYbtm5GK2GOhD_^jb<&a?WKBUN}b>xJ%84r!wgdw2aAUt)rZoqhYFImN~OfjYB6>sR8O#M0z)&oADO3w1=+6d=jiQB zdt7|(Lr^H)<3rlBxe+Vrpp{M&rWKM%I91U#!xS7x|}<$9@v^nvt+ zQp}NS=7&{@K3YA!O6$7#f`NvKGtj$&aGH0?QHolj?#zebbmg(ATAM-L+Vp4fb!Q4# zqDPqBxM+&%Tm#aH6nB*}H^)TVhfy>p^mU#)_22>H^&L&jA)gCxe_q0B${N)JRNMEZN)c^2XM5SGJEU7E<;<+JSIo$ufMN5#+vmev3A^`w43#N zt?L|_7U$yWyo-56C;;Cg|HI8Y30TiUb>jM97bdD<3wtCP)(_bQMh zZHBgeO00UpQ_~-zVLKKMHJtX)dfD{hz%e87%Q_qm@apa}Pg^sx=-m0_bndIZIZ62W z+OlEFQ3FEj6MKDHckUgb&vCM}sAR7ZOpBWmo1RS2f0p~6Hp-MrpN_^(C%f$AeB)wd zIibY!tCJT?lU%$snh7qjgru3qNxnVlTGo>c-xoa9rQVaIRc7QdD}X*vDJj!CeX-1} z-ufCF6m#;_tK8#k&mWtRxhgSPS-tA&IhCYWRml#mkzx8=Ci+(6Z4JA@YX|9*O6S8D z{YVyAZR)#kFqcz*UeF>;N#$X)Ykg$4%Qr2q=HzvxM=S9qQpp^Lu=YJdsvIf|-WOzP z8P1dXgnodtmohY75qL}4V9;XzJcPz$7wduarHPfL++;;{;Qx`J<*DV5`gPGzX9jySNorhDVz*O7-`P)x^sK=6dsm#LLmB zB_x;shf_o}jh_{Cuc;N*)AfvsjM?q_q@_nX7$AqHNP9eU>kE)h%(F(d<58+FrGy5hMG`;+<`ar`~6*MO|-C*ks*P|O2-j2dWL#kJ14BKnl&&Mcm z9uAg56sR8i+*F-@u&C$R+Zl`YkvHiJ(nmaRLTT^#UPjW7+?kc-61|58ExI)}l2%+Z z8Kj>(&G%aJi72CwtYBoSiy90yT`Z=Ah-q^@opIc<5Sa&_;JfmWpZjg%oq0+3#AsOx zJ&)WsQreC0X@RS6Sqel;; z>$q5A{YKdMlXs1epR99b>yY-fBzgfK))}i4K83S~d--W6BgmRdOIDxEkYZO#O83P# zpL8B2VY)lyYvoijh~H`-CH$G%k$8#T1lxlFG4us*4}%}Pm5x1Q(Y)&g7bxeVuT6Yc zWB$j;$OMM72O)Z_j~$8ZZC_j*p1x!9*jW4RNS5iLhY3cb8PdmC1Bx=(_PxuYupkc^ zAj#>wL}$@Noljx62Y#{8W4>MN?!uK4klQ6P!K@M%zo?5fXh& zN8>r)=yFT0Vz1z^IOw)J3%1#~b(F%KIY+&Rn740eXQXHO3!?qTuOI9jrl9gC+Fz_Q zeaRts@`MkSGRacINCi(gy5j8-?JC$IoxpP`wIGAT(Z+n_-Jy9l50MPk9fMPOd(4#U z)r`UB38$Ws4MCrlKMAG^d_g80Y#`GC6DB70mR#=}l2F(vh^%=cdVW_DjuD!47`AnHlK}7 zh*>;`B{ZL{ymbb$=h9H&yF;gBus|d7#}?%TS3>U%eFB2QXd(iZZ!L<4gRQNnhxo>~ zjofrEw$~+9f;J3!d_wL-In4;vRAgsnmT9YLPeey@Nn!1-%Ii^Py+ijq%2#iId2hSZ zGX@_xyN1EO|1?~sESg6=33H^pt9C%3W#_Smi^p106g%@t1m4ZR7;G#sXyHGMep<3p z(8pftMRiFCbgy&hXhuE%(8K&{dKed9-)bR3E5qknP^c#a{1&XFS8N8tomACcd>>Y%X5+{62BU zkwDOhIW)9Ju11L6Hf;GyOx$ZW`*Ym92}wt-(jd(0_s 
zH%wP+H9wsSeOgMv6xTFTma`Kr6I5!qtfxYslIS+YZ6PLlO^zdvk7*)AL2w1RFGA#$ zSolt~z!~Jg!nM(1Zv&pA_q9~6$@DX4VCaQSS`Lw_)Wp>~RlQ+e6VjlQc|*=;?o%Av zhRAK;$X;*0&yLP$5Nut-w7&Aon9Gg3)mIs7n4338Rr0FMSoy90aC4=4K|TDiHoVvI zb7D7ywAk^eUgem6u_=da2rn~&|jXVrA+ zp&IHPyL<`j919P2W20lET)EZLeS_n;-}8^B=zIv7@=2+wz?WEWA84M>6B( zyJ9i(2uK?i-P_hwid-L^o0}JhtVUx4R~HJcf-tL!kbtFiMDhRzRV7~73#s6QU}#rd z;SdaJJsgQ$Xrn=3-OU3&)(Dh?d}bM}Ud|34DKDBE1b+3MWwWw*xlC&{^m3WSis9ul zgVn8<%N$n5Gy@0xk4=&ZEk2Q^(VKF1=DzoSwMYK&Gnp5V3)aY~&mXD>dS)xHl@+#y z())Xi0&kqzf7Cb6+dB|=d1euKxgS&6kb<09@_WDTGnoADs9WJDZbX0bD+tC0k|kb& zg&;7qQVM2|hn8ZI(WCRzm31e|CqwC*;uQ`*3+TrEpa}dxqMs~gwE~#Sn5euqoQ*{{ z!;ktf`Umxp37NwDFIW?z$%L3rFHaJD3OzonIaPU0(V9kYjqX!Jgrapt%G$bP(UB=a zHJYaRgz({&k7pmoFMlHRovny+$;Eyu9rcDB<)Q9E@cPiL4uLUeCByq3ToC5m8{YH4 z&4E)keBjIl7EZzN?lW#DIHkkoF9{cx6Rs>GH7vPCIzGO{tMAy z&R>pSnoQ`+rSX^)SbsAO!Sq`yHcnsg0&(Ucc$b^sGf(v?rulq6u*mo_;DWVPayAgu z;o?QC`==IJPa4aP@`pb%1O5Ga>Mp_JlNw>h*$^h<-wIQn#^ebp+ z6fvmXkN|WU+G>AcSF{3#;|X=9>IhVhf%=i^uz{Qu^<&l17P(#2sZ~P~a{H*Wt45xZ z(biym1ZV2P-4=F3YM)o2VtXew-8k_Z=qWgpzAT6YrRUN!u^&IZuuOE13aGVRTec(05;TK+c#`6w`U3kqIulF2_1P|ll zm+Y;B%5I!fiBYx0uHB?^zXF5qic#GOqR8Yq1Pd;JQo~hA>&X*%4%!6gK&kS=cWahM zSs@ho5b1dzZk8483BP<3DFbO_lo8nzIs_#$QrQcla^w+%g`%K@k}87ryUch*Z9=CW zRNr)^e_N&fHC@ewlwC`#ux$I zn87*HVihA0X#+0_?6NVG4X$(&L^jS#Y;#!$%9^KST2I=odW|2U0dUVbY_zSxY^|H)O-$b^7S-J&^ z`vfBK8>0n`3A9;pcLP&X@FQGbr^H$q1sMO_JwC;xlP}9HH6xh!9NBYq7asFTIr%dA zsgH7u>g)OwFE7NZNP?(Ecm!Y}?NCy0RBctW`(l{D&iB=Az&TycWR-#R>(Su;to zG26(<8p>Y-SYOOi|HMn42HrD6tC}Q=D*s=e!DgRj5tx^CPdEc~w~_KW5V0dKCoFsy zlulCFuAaz@m)(X&;6}i?S-;nVB)By`JNx=qP%<1EJr>c`HR&)LvIi$TyziGPsPNoJ z{_NHxsnCEgQ}FlwVZAs&mE5k@4Jh!bK7lgeK`rL5lMSlF5BlO?*9u=?R^kmzw;=$R zluY6vBHvJR-e2YsfQDau`522YZMW(BCvQAeEo5t|_FL%jz?piG{BxhcvKv5j+(ERg zc%)|lm}S;m0p|4$=kW*Tm-pjMs^eS4S?(5bR8Lp$*j z<1fCX;3MAG*FFO2a6$GYE}z1U-cF7Yq{TG71`DlE%4RWg1H^@Jg#`=65@@O zTHb?4YWpj5#hc{6#s1f&g4#{RqZn4Tt`C3#jbK+Ns1ma%=ZZHK` zRd1or*^YUrXEx9HlAGq}0pQRV=&1!~N#c0dZaDsT(5J-D?5v|%3aa{WvF zjeq;I9cO^K&dTA}ouKhAVmI2o<}IoM{^lRn$1kM|=C1k{_}<(NXCK$n0<($;F~VYu zZ=N$!JqPB@6rzINl)7mLS2+o0cZ{x|Na=aKA9*0`rZJ*UyrYgw!zi%AC{U=w8_xp` zV1cb+Kk``&VBz?Ijyi+wSUA}X_>clvfGF!M%*vhJ)3GFqw+r4zm`ge9nP$^-T%~N_ zg+R??fjZ~6W1$SN5Xuf%IDIV8$z?khcxUFgv^f9^zQ+P1-L_-l3Sfbr6TE=YT1Z0C zr^HUkWtQWAAhTJ}*Y%EHdOrioLY_*RI9tx2S#Gse3(kJ=ODqe31^(oTvt3)Zz|d83 zuNPgOTz86FxR`?>vIHh7B47)5-+MhtOpJ^&SbKNSOs?Vg(Sd+;1)pl8%Y2LATHjgUqI#? 
z8Y6(Ecg2%An;G__BVe32TRafwp$j?3)G(_~47gb!61Qw2$ykvGw-S2o70nuk!2FiV zw&=r#*Iyk8w2DE=SHw;rM9|low`^yVC$0HN7`j|$ynA!RevAIk|MvH+%^+HT^hJ5@ z=xisM)_$@pRx8TKEg!)mZ^iC30%w5^W%53U-Dr;`g@f*b4m$EahD8-<+AO%rwG=9@ z>MwN*2P`UQA&tp|J&)v7`h+|g$sf5(S-^I?ni{zlDR#HO|%+8M>2VC z!>$|0vKeVlfaD!{qF~pgVwvDtJs^KI+)4zl=3}R{;qjE91>|uY>wiwEKfJn0y8vpW z;AHfdQv4?w2}oT4Wv-zC!r)e+T;@;S^KG_&{?le?Z9kCTJF-Pw{uw6zd2p`Az*3g) z8DsTY^#is(p-nVN{*o1%w?lh3OuWY_MAy8}B(Xqz=xh5e;L|-N3V1&b6=gt?n7g&i zwkeft1Kt~i*o5)_e-sLcc$j!Rvk*>#Ew)-D%4Kfm$^Rr6r?x>#ZkVpk=tQEG@J{l% zza@G9HlTaJRqk5>=(lpI+}tt=$I-u*4##kH5+V^b_2(M-Gv@wJgMSdmP0II|3xvO@ z!8Hqto4cw`G7I0^F_K1ldqD!3d>yc`_83_seKrukBi~!tm4X;)IM+>Fg@jn2f)q7t zd^{cm1&SopR-5xLz4dl`BsBKcdvJ_^W(-NCC@--I$?3%TqxwsV;E41G*WR3v89yc1 zOW|0(bUZKWCh>tD1x;*YGRy0F%VCYAS<8XJe% z$0qCh9T*>m9gnrt_d7T~4fBp2&__#*e}d`9D(Isn$0uQ~vE};c1LLF0?G+dABPuWz zolXUa_4`{r`NQ?+#Qhl+cB!cVAePebj57{E>l(N-voBc0a4Dj-v(_JCB3um?0!@An zt8df({mh>haUt_s-PSU1J>0*aciS`Wa`J4)A6FakkomjLiX8X_Z+{Okjx&g{9PCor zV`SI7$0R#ysI{U1GE3zrZ6%+AvW7Q-a|O$iQFicNFjBBA1$72q2Tl%5OCU9P>YhHiXTN8-Q5%C*qaZz7phbK<0jMG&hFd1kySS z`Hpoz*M@IW@IT@EDKpnKG%?|dF3y3IE6)~m{pPR*n2{8e6MP;lE?9OCbq+oWb`>m3 zLbg znOe=Tt>2R4N2?xJNK5|8#0rc#)>6;!(D(vOCU!v2Pi7ngqls0}Lraa%!^C6D_0ZDe zt1#}^ATC^SjmGps3hYubi)28(IX(`XSk1x{WZzW#Us#*_?7`_?_J3-rzSZ^rmV!@q zQn9ZW9VD+wiv$6-k3p=wpGp@&PGc_XaMlJSo8wXm+CjkdM zMf2*Kqj?9#82%eP+mUzwx+nM)f$W}N)C<@H&7x?+V)J~4M2 z8}{Zv;kPSm-Op9i@x+qekz4h=zk*EcS16JA}vT@Tv6df{EB;)3)2cE+}LzyE?}2FAncXdv1^8?`nO zi}yb8H#&_K0oQ3}bo=q%dHSa(jJw8(HzdDw9GIwgXzbs!Dbf5jHEv6OHr5J@YkB*& z74HA=MULA?}m-WTI%>68Sj8~ z#SZBB$&L5JmSPoj(1*v{V9l}RI%wJPUKlbqNC)jpi@SJV=!$Jt1g>RchJg!U+k)3u zW^C~){J(hn2)h!F+u?H!`o1Im8Mij|*#DkRH)(;p|M%3m&VllT=YyMt%I=`t;o0C^ zp|S+jMR+L~DHM1IqyKl?_3LZ#xQ(Ock?bvs*ODvt%zyWu z1Qv4ZeUny|0R!;)xSKDPB%WnJZFo9Y`~4E)j4n8-rXX%>pzb~Qf{u4dbLOO|6kbCM zP9$cyJ}m#~4-}c}7s&1!K#sE8F6g!|mHqV1e>C`YIPeMI*b+%Y2yBWwWh)kfRI(KN zf(6?Z_mKSa`Chr0bZNZ(>PpkpnWHit>MIzEAS#`y%4)7H z9d^UH!kStx%X450$&%+w!}kK#lx+Pao{`aS?En%+sLD@s)%Rl816h*i^TV?ixh`6@ zF^zK(y`5CDzxj^eLaijylW|-Yt_$l0dEcDjmsj(PL@|sXf%Cz&VO=2Cn*#^gmJD1J zIw#jXyTwbzH*Tu7amW(^>F6)$?$c_1BWhX?))jqbBH40K#>7atQK$Kmht?dW)9qci z8Di$NwdOo3-$b@o-)iTBp1VI-u(r0FMHBH(qvB|VaOq+GL(i+JH!S}6_y+67MeJ4J zR&HqEsVWS>JAm7adpMkNx3xLu>F(%azj4cEr(1@u^apm;A#N^Hrys3)VrIrVHY4~x z!?;s7_JmAzV%I@$k!#2H7pqjr%S@AW2cg7Y%wvW0vLY%fnaMa}B8=JnB@dForC;1A z>2aEgcWu_aDQQ5du)A~QYJ<5{u9b<%uyxiW<3xo~OkJK%FBdT^wn+1?X;3Y%Qu6Bu zqHRG+4#%SJgo^9i=c}{6x+(I)4Q8HL)5pMHjNrE{zm#9>m^E~FuS|A!)*D?D1x<0A zC^8g>3;D}Q@>AuXS;9NdeKrpDpek?skU+}J7LZ%%K^0_B(`=mOmzO1yU6Pk2mW|HK zI*=`!|GKAx<@L?P6DqH(Kicjh6d~R<800bHKz1)uUTB6{h)mO3!;j%@N}5?euQ)Yb zlDAx|6lrp|`jP8qs6c8h%sQduuslhl@F0era#PC7>!fFp&)BWs81;IW zcfa+kYL&l4L!dITkXTBV?B7%=M6HlpzJB6b!2n~{!yoY2Pgtbx1h1rPDFuI{#9pN8|lM+pFs4u~<-u99H5 zf*`@FHpGTF$9PdwIg=e*l;9vYWk9I77c%xr?Q!d0FLNyIpsK0s^OzgHryVo?WN)Y# z%c=Ig>Qa2d1On&GGQ6%^&ind=6HOMyyv$*QIRLkG&2NfOa}Yd7+?>{V+MH1O^B&uT z7LlIA>8!P;PWK0;=ctEkOq}=5thMGm8ySqvC5$Fd_^^{BzR#_XNL8M;l~APSQoB{! z6oJuc;Ri&+`_kxY6fMcp@*tLDx9IInipmy~sNM9tjZlLh2D+^!q;=^JxiMFbrY{Gs zpx`D{lpQl#1dY2z?nu0{aL#*CF@S8`k+Q3oB(AUI5n>^W;7G51z8|skqr}s6B&VK) zWRu*YenP+2JGo+!v`z;Nrm08EY7UD9`MenwX1{63VZ;`*dapK8kEM9kA@yFrEAdk3 z1nn4P@bI2wmQhsP%>(?X%RW17EN15QV*>y5#_#kFcg4d1NRk1s(r$+2nF|;Ghe!a5 zU%&3fyv5xE|1y#UO-+P6FkC>sGt}B(kC?lI0rAC{`M8s2Y7RypTy~V|BW_He_&f*Z{{O}%y|1T}SNt49LeJF1fS zsTBw9)X~S?hsk~THQ$pGJ!o|$_6V1#E+kE9xSM=jURJM|2YZU>rE!fo567#$9jc48 zW&~v2?U7z%SMvk4RuWFC38tz`P}R9VN$Oh9x41zh^6pA#a=^C zO5YhwmTv`1@|qjE>!R;-Va3~XVfhKIu8Qy1H||`DON4|(VnEXS^9PLmwpPD=|C7g! 
z{b=C(M-+!xu$Uk1{P@g}Z!zl|83|hP;~(3{qb(~KN|S)PQu>|#4!Hu_#f^o{G*j0 zYng9WOl`l+;K$15M+@KQ)NkVfIkP7q_(O*MarpQ8w|RJ~ NOF$C0a>D)g{{Z$ox;6j+ literal 0 HcmV?d00001 From c8bac054d86449a8ec6d9db35a67bca0ac3cf3ad Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Thu, 18 Jan 2024 11:44:12 +0100 Subject: [PATCH 05/36] Use new constants for reading annotation files --- backend/analysis/annotations/safreader.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/backend/analysis/annotations/safreader.py b/backend/analysis/annotations/safreader.py index 91650b9a..496dd2ec 100644 --- a/backend/analysis/annotations/safreader.py +++ b/backend/analysis/annotations/safreader.py @@ -4,11 +4,11 @@ import pandas as pd from analysis.models import Transcript +from annotations.constants import SAF_COMMENT_LEVEL, SAF_UNALIGNED_LEVEL, SAF_UTT_LEVEL from .annotation_format import (SAFAnnotation, SAFDocument, SAFUtterance, SAFWord) -from .constants import (LABELSEP, PREFIX, SAF_COMMENT_LEVEL, - SAF_UNALIGNED_LEVEL, UTTLEVEL) +from .constants import (LABELSEP, PREFIX) from .utils import (clean_item, clean_row, enrich, getlabels, item2queryid, mkpatterns, standardize_header_name) @@ -30,7 +30,8 @@ class UnalignedWord(Exception): def get_word_levels(data: pd.DataFrame): levels = data.level - filtered_levels = levels[~levels.isin([SAF_COMMENT_LEVEL.lower(), UTTLEVEL.lower()])] + filtered_levels = levels[~levels.isin( + [SAF_COMMENT_LEVEL.lower(), SAF_UTT_LEVEL.lower()])] return list(filtered_levels.unique()) @@ -45,7 +46,7 @@ def word_level_data(word_data: pd.DataFrame, colname: str): raise UnalignedWord elif word_data.empty: raise NoWordDataException - utt_data = word_data.loc[word_data.level == UTTLEVEL, colname] + utt_data = word_data.loc[word_data.level == SAF_UTT_LEVEL, colname] return utt_data @@ -73,14 +74,15 @@ def loaddata(self, filepath): data = pd.read_excel(filepath, engine='openpyxl') data.rename(columns=standardize_header_name, inplace=True) data = data.where(data.notnull(), None) - self.word_cols = [SAF_UNALIGNED_LEVEL.lower()] + list(filter(is_word_column, data.columns)) + self.word_cols = [SAF_UNALIGNED_LEVEL.lower()] + \ + list(filter(is_word_column, data.columns)) # Do we need to drop empty columns? Seems we don't. 
If otherwise, make sure word_columns are not dropped # data.dropna(how='all', axis=1, inplace=True) relevant_cols = ['utt_id', 'level'] + self.word_cols self.levels = [lv for lv in list( - data.level.dropna().unique()) if lv.lower() != UTTLEVEL] + data.level.dropna().unique()) if lv.lower() != SAF_UTT_LEVEL] data = data[relevant_cols].apply(clean_row, axis='columns') From 342baf49736e3c0bf011bced5bc5dd8721a5214d Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 24 Jan 2024 13:05:21 +0100 Subject: [PATCH 06/36] Use SASTADEV labels for SAFReader --- backend/annotations/constants.py | 24 ++++++++++++++++++------ backend/annotations/utils.py | 18 +++++++++++++----- backend/annotations/writer_tests.py | 2 +- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/backend/annotations/constants.py b/backend/annotations/constants.py index e2538e60..dadde2fe 100644 --- a/backend/annotations/constants.py +++ b/backend/annotations/constants.py @@ -1,8 +1,20 @@ -SAF_COMMENT_LEVEL = 'Commentaar' -SAF_COMMENT_COLUMN = 'Commentaar' -SAF_UNALIGNED_LEVEL = 'Hele uiting' -SAF_UTT_LEVEL = 'Utt' -PRE_WORDS_HEADERS = ['ID', 'Level', SAF_UNALIGNED_LEVEL] -POST_WORDS_HEADERS = ['Fases', SAF_COMMENT_LEVEL] +from annotations.utils import preflabel +from sastadev.SAFreader import (commentsheaders, stagesheaders, + unalignedheaders, uttidheaders, levelheaders) + +SAF_COMMENT_LEVEL = preflabel(commentsheaders, str.capitalize) +SAF_COMMENT_COLUMN = preflabel(commentsheaders, str.capitalize) + +SAF_UTT_LEVEL = preflabel(uttidheaders, str.capitalize) +SAF_UNALIGNED_LEVEL = preflabel(unalignedheaders, str.capitalize) + +SAF_LEVEL_HEADER = preflabel(levelheaders, str.capitalize) + +SAF_FASES_COLUMN = preflabel(stagesheaders, str.capitalize) + +# Composed headers +PRE_WORDS_HEADERS = ['ID', SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL] +POST_WORDS_HEADERS = [SAF_FASES_COLUMN, SAF_COMMENT_COLUMN] + PRIMARY_COLOR = '3f51b5' SECONDARY_COLOR = 'b5a33f' diff --git a/backend/annotations/utils.py b/backend/annotations/utils.py index 1195f571..fa3cd833 100644 --- a/backend/annotations/utils.py +++ b/backend/annotations/utils.py @@ -1,6 +1,6 @@ -from typing import Any, List +from typing import Any, List, Optional -from annotations.constants import PRIMARY_COLOR, SAF_UTT_LEVEL +from annotations import constants from openpyxl.styles import Font, PatternFill from openpyxl.styles.protection import Protection from openpyxl.utils import get_column_letter @@ -8,6 +8,14 @@ from sastadev.allresults import AllResults +def preflabel(labels: List[str], casing: Optional[callable] = None) -> str: + try: + label = labels[0] + return casing(label) if casing else label + except IndexError: + return '' + + def get_max_words(allresults: AllResults) -> int: '''Get the length of the longest utterance in the results''' return max(len(v) for v in allresults.allutts.values()) @@ -34,12 +42,12 @@ def format_worksheet(worksheet) -> None: # yelow background for each utterance row for row in list(worksheet.rows)[1:]: - if row[1].value == SAF_UTT_LEVEL: + if row[1].value == constants.SAF_UTT_LEVEL: for cell in row: cell.font = Font(color='FFFFFF') cell.fill = PatternFill( - start_color=PRIMARY_COLOR, - end_color=PRIMARY_COLOR, + start_color=constants.PRIMARY_COLOR, + end_color=constants.PRIMARY_COLOR, fill_type="solid") else: # unlock non-utterance rows diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py index 0f3cc87f..07e621c8 100644 --- a/backend/annotations/writer_tests.py +++ 
b/backend/annotations/writer_tests.py @@ -25,7 +25,7 @@ def test_uttlevel_row(safwriter: SAFWriter): id = 1 words = safwriter.results.allutts[id] found = safwriter._uttlevel_row(id, words) - expected = [1, 'Utt', None, 'ja', 'uh', 'ik', 'vind', 'het', 'beetje', + expected = [1, 'Uiting', None, 'ja', 'uh', 'ik', 'vind', 'het', 'beetje', 'moeilijk', 'om', 'het', 'goed', 'te', 'vertellen', 'want', 'ik', 'heb', 'een', 'ongeluk', 'gehad', None, None] assert found == expected From 58d70bf96de5ae480438f093cc178e21fe955841 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 24 Jan 2024 15:04:11 +0100 Subject: [PATCH 07/36] Adapt SAFReader to new SAFWriter --- backend/analysis/annotations/safreader.py | 11 ++++++----- backend/annotations/constants.py | 3 +++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/backend/analysis/annotations/safreader.py b/backend/analysis/annotations/safreader.py index 496dd2ec..0b335601 100644 --- a/backend/analysis/annotations/safreader.py +++ b/backend/analysis/annotations/safreader.py @@ -4,11 +4,12 @@ import pandas as pd from analysis.models import Transcript -from annotations.constants import SAF_COMMENT_LEVEL, SAF_UNALIGNED_LEVEL, SAF_UTT_LEVEL +from annotations.constants import (SAF_COMMENT_LEVEL, SAF_UNALIGNED_LEVEL, + SAF_UNALIGNED_LEVELS, SAF_UTT_LEVEL, SAF_UTT_LEVELS) from .annotation_format import (SAFAnnotation, SAFDocument, SAFUtterance, SAFWord) -from .constants import (LABELSEP, PREFIX) +from .constants import LABELSEP, PREFIX from .utils import (clean_item, clean_row, enrich, getlabels, item2queryid, mkpatterns, standardize_header_name) @@ -42,11 +43,11 @@ def is_word_column(column_name: str) -> bool: def word_level_data(word_data: pd.DataFrame, colname: str): '''returns combination word/level ''' - if colname.lower() == SAF_UNALIGNED_LEVEL.lower(): + if colname.lower() in SAF_UNALIGNED_LEVELS: raise UnalignedWord elif word_data.empty: raise NoWordDataException - utt_data = word_data.loc[word_data.level == SAF_UTT_LEVEL, colname] + utt_data = word_data.loc[word_data.level.isin(SAF_UTT_LEVELS), colname] return utt_data @@ -82,7 +83,7 @@ def loaddata(self, filepath): relevant_cols = ['utt_id', 'level'] + self.word_cols self.levels = [lv for lv in list( - data.level.dropna().unique()) if lv.lower() != SAF_UTT_LEVEL] + data.level.dropna().unique()) if lv.lower() not in SAF_UTT_LEVELS] data = data[relevant_cols].apply(clean_row, axis='columns') diff --git a/backend/annotations/constants.py b/backend/annotations/constants.py index dadde2fe..f53e6ecd 100644 --- a/backend/annotations/constants.py +++ b/backend/annotations/constants.py @@ -6,7 +6,10 @@ SAF_COMMENT_COLUMN = preflabel(commentsheaders, str.capitalize) SAF_UTT_LEVEL = preflabel(uttidheaders, str.capitalize) +SAF_UTT_LEVELS = list(map(str.lower, uttidheaders)) + SAF_UNALIGNED_LEVEL = preflabel(unalignedheaders, str.capitalize) +SAF_UNALIGNED_LEVELS = list(map(str.lower, unalignedheaders)) SAF_LEVEL_HEADER = preflabel(levelheaders, str.capitalize) From b5d71aa68483f19e6279e42a6fb3c4c199dfe3b4 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 30 Jan 2024 11:21:07 +0100 Subject: [PATCH 08/36] Adapt SAFReader to new constants --- backend/analysis/annotations/constants.py | 24 ++++++++++++------- backend/analysis/annotations/safreader.py | 24 ++++++++++++------- .../analysis/annotations/safreader_test.py | 9 +++++-- backend/annotations/constants.py | 12 +++++++--- backend/annotations/writer_tests.py | 10 ++++---- 5 files changed, 53 insertions(+), 26 deletions(-) diff 
--git a/backend/analysis/annotations/constants.py b/backend/analysis/annotations/constants.py index 7340b3c5..1865c5f1 100644 --- a/backend/analysis/annotations/constants.py +++ b/backend/analysis/annotations/constants.py @@ -1,5 +1,9 @@ from typing import Counter, Dict, Optional, Tuple +from annotations.constants import (SAF_COMMENT_COLUMN, SAF_COMMENT_HEADERS, SAF_COMMENT_LEVEL, SAF_FASES_COLUMN, + SAF_FASES_HEADERS, SAF_LEVEL_HEADER, SAF_LEVEL_HEADERS, + SAF_SPEAKER_COLUMNS, SAF_SPEAKER_HEADER, SAF_UNALIGNED_LEVEL, SAF_UNALIGNED_LEVELS, SAF_UTT_HEADER, SAF_UTT_LEVELS) + # Type annotations TupleStrDict = Dict[Tuple[Optional[str], Optional[str]], str] CounterDict = Dict[str, Counter[str]] @@ -7,19 +11,21 @@ # Global ITEMSEPPATTERN = r'[,-; ]' LABELSEP = ',' -UTTLEVEL = 'utt' + HEADER_VARIANTS = { - 'speaker': ['speaker', 'spreker', 'spk'], - 'utt_id': ['id', 'utt', 'uttid'], - 'level': ['level'], - 'phase': ['fases', 'stages'], - 'comments': ['comments', 'commentaar'] + SAF_UTT_HEADER.lower(): SAF_UTT_LEVELS, + SAF_SPEAKER_HEADER.lower(): SAF_SPEAKER_COLUMNS, + SAF_UNALIGNED_LEVEL.lower(): SAF_UNALIGNED_LEVELS, + SAF_LEVEL_HEADER.lower(): SAF_LEVEL_HEADERS, + SAF_FASES_COLUMN.lower(): SAF_FASES_HEADERS, + SAF_COMMENT_COLUMN.lower(): SAF_COMMENT_HEADERS + } + + PREFIX = "" ALTITEMSEP = IMPLIESSEP = ',' -SAF_COMMENT_LEVEL = 'Commentaar' -SAF_UNALIGNED_LEVEL = 'Unaligned' # Define (lowercased) levels that should not be cleaned # Currently, only comment rows should be excempt -NO_CLEAN_LEVELS = (SAF_COMMENT_LEVEL.lower(), ) +NO_CLEAN_LEVELS = [*SAF_COMMENT_HEADERS] diff --git a/backend/analysis/annotations/safreader.py b/backend/analysis/annotations/safreader.py index 0b335601..b60dd31f 100644 --- a/backend/analysis/annotations/safreader.py +++ b/backend/analysis/annotations/safreader.py @@ -4,8 +4,9 @@ import pandas as pd from analysis.models import Transcript -from annotations.constants import (SAF_COMMENT_LEVEL, SAF_UNALIGNED_LEVEL, - SAF_UNALIGNED_LEVELS, SAF_UTT_LEVEL, SAF_UTT_LEVELS) +from annotations.constants import (SAF_COMMENT_HEADERS, SAF_UNALIGNED_LEVEL, + SAF_UNALIGNED_LEVELS, SAF_UTT_HEADER, + SAF_UTT_LEVELS) from .annotation_format import (SAFAnnotation, SAFDocument, SAFUtterance, SAFWord) @@ -32,7 +33,7 @@ class UnalignedWord(Exception): def get_word_levels(data: pd.DataFrame): levels = data.level filtered_levels = levels[~levels.isin( - [SAF_COMMENT_LEVEL.lower(), SAF_UTT_LEVEL.lower()])] + [*SAF_COMMENT_HEADERS, *SAF_UTT_LEVELS])] return list(filtered_levels.unique()) @@ -81,11 +82,18 @@ def loaddata(self, filepath): # Do we need to drop empty columns? Seems we don't. 
If otherwise, make sure word_columns are not dropped # data.dropna(how='all', axis=1, inplace=True) - relevant_cols = ['utt_id', 'level'] + self.word_cols + relevant_cols = [ + *SAF_UTT_LEVELS, + 'level', + *SAF_UNALIGNED_LEVELS, + *self.word_cols + ] + to_clean_cols = [col for col in set( + relevant_cols) if col in data.columns] self.levels = [lv for lv in list( data.level.dropna().unique()) if lv.lower() not in SAF_UTT_LEVELS] - data = data[relevant_cols].apply(clean_row, axis='columns') + data = data[to_clean_cols].apply(clean_row, axis='columns') return data @@ -96,8 +104,8 @@ def make_mappings(self): return item_mapping, patterns def get_annotations(self, data): - for utt_id in data.utt_id.unique(): - utt_rows = data[data.utt_id == utt_id] + for utt_id in data[SAF_UTT_HEADER.lower()].unique(): + utt_rows = data[data[SAF_UTT_HEADER.lower()] == utt_id] parsed_utterance = self.parse_utterance(utt_id, utt_rows) self.document.utterances.append(parsed_utterance) @@ -147,7 +155,7 @@ def parse_word(self, utt_id, word_id, colname, word_data, wordposmap): level, utt_id, word_id, text) # read comments - comment_data = data.loc[data.level == SAF_COMMENT_LEVEL.lower()].dropna() + comment_data = data.loc[data.level.isin(SAF_COMMENT_HEADERS)].dropna() if not comment_data.empty: instance.comment = str(comment_data[colname].iloc[0]) diff --git a/backend/analysis/annotations/safreader_test.py b/backend/analysis/annotations/safreader_test.py index 7600cce3..47d4157e 100644 --- a/backend/analysis/annotations/safreader_test.py +++ b/backend/analysis/annotations/safreader_test.py @@ -63,7 +63,8 @@ def test_astalex(asta_method, asta_transcript, asta_transcript_corrections, cha_ def test_wordlevels(): - data = {'level': map(str.lower, ['Utt', 'QA', 'SZ', 'Grammaticale Fout', 'Commentaar']), + data = {'level': map(str.lower, ['Uiting', 'QA', 'SZ', + 'Grammaticale Fout', 'Opmerkingen']), 'word1': [1, None, 'X', 'V, BvBB', 'Hier staat wat commentaar']} df_in = DataFrame.from_dict(data) @@ -72,7 +73,11 @@ def test_wordlevels(): def test_read_saf_comments(tarsp_method, tarsp_transcript, cha_testfiles_dir): - reader = SAFReader(op.join(cha_testfiles_dir, 'sample_5_SAF_with_comments.xlsx'), tarsp_method, tarsp_transcript) + reader = SAFReader( + op.join( + cha_testfiles_dir, + 'sample_5_SAF_with_comments.xlsx'), + tarsp_method, tarsp_transcript) sent = reader.document.utterances[3] assert sent.words[1].comment == 'Ik vind hier iets van.' 
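[Editorial note on the lowercased levels exercised by test_wordlevels above: the reader compares SAF level names case-insensitively against the sastadev header lists. A minimal self-contained sketch of that pattern follows; the header values are illustrative stand-ins, not the actual sastadev constants, and a plain list comprehension stands in for the pandas isin filter.]

    # Hedged sketch: case-insensitive level filtering via lowercased header lists.
    uttidheaders = ['Utt', 'Uiting']                 # assumed example values
    commentsheaders = ['Opmerkingen', 'Commentaar']  # assumed example values
    SAF_UTT_LEVELS = [h.lower() for h in uttidheaders]
    SAF_COMMENT_HEADERS = [h.lower() for h in commentsheaders]

    levels = ['uiting', 'qa', 'sz', 'grammaticale fout', 'opmerkingen']
    word_levels = [lv for lv in levels
                   if lv not in SAF_UTT_LEVELS + SAF_COMMENT_HEADERS]
    print(word_levels)  # ['qa', 'sz', 'grammaticale fout']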
assert sent.words[2].comment == '1' diff --git a/backend/annotations/constants.py b/backend/annotations/constants.py index f53e6ecd..614e355c 100644 --- a/backend/annotations/constants.py +++ b/backend/annotations/constants.py @@ -1,19 +1,25 @@ from annotations.utils import preflabel -from sastadev.SAFreader import (commentsheaders, stagesheaders, - unalignedheaders, uttidheaders, levelheaders) +from sastadev.SAFreader import (commentsheaders, levelheaders, speakerheaders, + stagesheaders, unalignedheaders, uttidheaders) SAF_COMMENT_LEVEL = preflabel(commentsheaders, str.capitalize) SAF_COMMENT_COLUMN = preflabel(commentsheaders, str.capitalize) +SAF_COMMENT_HEADERS = list(map(str.lower, commentsheaders)) -SAF_UTT_LEVEL = preflabel(uttidheaders, str.capitalize) +SAF_UTT_HEADER = SAF_UTT_LEVEL = preflabel(uttidheaders, str.capitalize) SAF_UTT_LEVELS = list(map(str.lower, uttidheaders)) SAF_UNALIGNED_LEVEL = preflabel(unalignedheaders, str.capitalize) SAF_UNALIGNED_LEVELS = list(map(str.lower, unalignedheaders)) SAF_LEVEL_HEADER = preflabel(levelheaders, str.capitalize) +SAF_LEVEL_HEADERS = list(map(str.lower, levelheaders)) SAF_FASES_COLUMN = preflabel(stagesheaders, str.capitalize) +SAF_FASES_HEADERS = list(map(str.lower, stagesheaders)) + +SAF_SPEAKER_HEADER = preflabel(speakerheaders, str.capitalize) +SAF_SPEAKER_COLUMNS = list(map(str.lower, speakerheaders)) # Composed headers PRE_WORDS_HEADERS = ['ID', SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL] diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py index 07e621c8..9c38340d 100644 --- a/backend/annotations/writer_tests.py +++ b/backend/annotations/writer_tests.py @@ -1,3 +1,5 @@ +from annotations.constants import (SAF_COMMENT_COLUMN, SAF_FASES_COLUMN, + SAF_UNALIGNED_LEVEL) from annotations.writer import SAFWriter from .utils import ljust @@ -11,13 +13,13 @@ def test_safwriter(safwriter: SAFWriter): def test_headers(safwriter: SAFWriter): found = safwriter._annotations_header_row() - expected = ['ID', 'Level', 'Hele uiting', + expected = ['ID', 'Level', SAF_UNALIGNED_LEVEL, 'Word1', 'Word2', 'Word3', 'Word4', 'Word5', 'Word6', 'Word7', 'Word8', 'Word9', 'Word10', 'Word11', 'Word12', 'Word13', 'Word14', 'Word15', 'Word16', 'Word17', 'Word18', - 'Fases', 'Commentaar'] + SAF_FASES_COLUMN, SAF_COMMENT_COLUMN] assert found == expected @@ -44,7 +46,7 @@ def test_ljust_list(): def test_uttlevel_offset(safwriter: SAFWriter): assert safwriter._uttlevel_row_number(0, 'Samplegrootte') == 3 assert safwriter._uttlevel_row_number(0, 'Taalmaat') == 5 - assert safwriter._uttlevel_row_number(0, 'Commentaar') == 8 + assert safwriter._uttlevel_row_number(0, 'Opmerkingen') == 8 assert safwriter._uttlevel_row_number(2, 'Samplegrootte') == 17 assert safwriter._uttlevel_row_number(2, 'Taalmaat') == 19 - assert safwriter._uttlevel_row_number(2, 'Commentaar') == 22 + assert safwriter._uttlevel_row_number(2, 'Opmerkingen') == 22 From 162a7334e4e3882983ab254e3692ab83457146d7 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 31 Jan 2024 09:59:41 +0100 Subject: [PATCH 09/36] Finetune SAFWriter --- backend/analysis/views.py | 3 --- backend/annotations/constants.py | 2 +- backend/annotations/writer.py | 8 +++++--- backend/annotations/writer_tests.py | 21 +++++++++------------ 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/backend/analysis/views.py b/backend/analysis/views.py index 42b344e2..80e81af4 100644 --- a/backend/analysis/views.py +++ b/backend/analysis/views.py @@ -106,12 +106,9 @@ def annotate(self, 
request, *args, **kwargs): transcript, method, True, zc_embed ) - # spreadsheet = annotations_to_xlsx(allresults, method) writer = SAFWriter(method.to_sastadev(), allresults) - writer.make_workbook() spreadsheet = writer.workbook - self.create_analysis_run(transcript, method, spreadsheet) format = request.data.get('format', 'xlsx') diff --git a/backend/annotations/constants.py b/backend/annotations/constants.py index 614e355c..175b225a 100644 --- a/backend/annotations/constants.py +++ b/backend/annotations/constants.py @@ -22,7 +22,7 @@ SAF_SPEAKER_COLUMNS = list(map(str.lower, speakerheaders)) # Composed headers -PRE_WORDS_HEADERS = ['ID', SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL] +PRE_WORDS_HEADERS = [SAF_UTT_HEADER, SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL] POST_WORDS_HEADERS = [SAF_FASES_COLUMN, SAF_COMMENT_COLUMN] PRIMARY_COLOR = '3f51b5' diff --git a/backend/annotations/writer.py b/backend/annotations/writer.py index 4bed732f..36885677 100644 --- a/backend/annotations/writer.py +++ b/backend/annotations/writer.py @@ -1,6 +1,7 @@ +from io import BytesIO import itertools from dataclasses import dataclass, field -from typing import Dict, List, TextIO, Tuple +from typing import Dict, List, Tuple from analysis.models import MethodCategory from openpyxl import Workbook @@ -48,10 +49,11 @@ def __post_init__(self) -> None: } self.utt_n_rows = (len(all_levels)) self.anno_headers = self._annotations_header_row() + self.make_workbook() - def write(self, target: TextIO) -> None: + def write(self, target: BytesIO) -> None: '''Write the completed output file''' - self.workbook.save('/Users/a3248526/Documents/sasta_saf_test.xlsx') + self.workbook.save(target) def make_workbook(self) -> None: '''Create the complete workbook. diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py index 9c38340d..44c5dbfc 100644 --- a/backend/annotations/writer_tests.py +++ b/backend/annotations/writer_tests.py @@ -1,24 +1,21 @@ from annotations.constants import (SAF_COMMENT_COLUMN, SAF_FASES_COLUMN, - SAF_UNALIGNED_LEVEL) + SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL, + SAF_UTT_HEADER) from annotations.writer import SAFWriter from .utils import ljust def test_safwriter(safwriter: SAFWriter): - safwriter.make_workbook() - safwriter.write('/Users/a3248526/Documents/saf_writer_test.xlsx') + with open('/Users/a3248526/Documents/saf_writer_test.xlsx', 'wb') as f: + safwriter.write(f) assert safwriter def test_headers(safwriter: SAFWriter): found = safwriter._annotations_header_row() - expected = ['ID', 'Level', SAF_UNALIGNED_LEVEL, - 'Word1', 'Word2', 'Word3', 'Word4', - 'Word5', 'Word6', 'Word7', 'Word8', - 'Word9', 'Word10', 'Word11', 'Word12', - 'Word13', 'Word14', 'Word15', 'Word16', - 'Word17', 'Word18', + expected = [SAF_UTT_HEADER, SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL, + *[f'Word{n}' for n in range(1, 19)], SAF_FASES_COLUMN, SAF_COMMENT_COLUMN] assert found == expected @@ -27,9 +24,9 @@ def test_uttlevel_row(safwriter: SAFWriter): id = 1 words = safwriter.results.allutts[id] found = safwriter._uttlevel_row(id, words) - expected = [1, 'Uiting', None, 'ja', 'uh', 'ik', 'vind', 'het', 'beetje', - 'moeilijk', 'om', 'het', 'goed', 'te', 'vertellen', 'want', - 'ik', 'heb', 'een', 'ongeluk', 'gehad', None, None] + expected = [1, SAF_UTT_HEADER, None, 'ja', 'uh', 'ik', 'vind', 'het', + 'beetje', 'moeilijk', 'om', 'het', 'goed', 'te', 'vertellen', + 'want', 'ik', 'heb', 'een', 'ongeluk', 'gehad', None, None] assert found == expected From 745134d8cc86dd09f0029b00109fabf322f261ca Mon Sep 17 00:00:00 2001 
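[Usage note on the SAFWriter changes in PATCH 09 above: make_workbook() now runs in __post_init__ and write() takes a binary buffer instead of a hard-coded path, so a caller can render the annotations spreadsheet to memory. A minimal sketch, assuming a prepared sastadev method and AllResults are already in scope:]

    from io import BytesIO

    # Sketch: render the SAF workbook to an in-memory buffer,
    # e.g. to attach it to an HTTP response or an AnalysisRun.
    writer = SAFWriter(sd_method, allresults)  # workbook is built on init
    buffer = BytesIO()
    writer.write(buffer)                       # delegates to Workbook.save(buffer)
    xlsx_bytes = buffer.getvalue()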
From: Jelte van Boheemen Date: Wed, 7 Feb 2024 10:23:26 +0100 Subject: [PATCH 10/36] Beginning of lemmas filling --- backend/annotations/constants.py | 7 +++++-- backend/annotations/writer.py | 10 ++++++++-- backend/conftest.py | 11 ++++++++--- backend/setup.cfg | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/backend/annotations/constants.py b/backend/annotations/constants.py index 175b225a..7785c74f 100644 --- a/backend/annotations/constants.py +++ b/backend/annotations/constants.py @@ -1,6 +1,7 @@ from annotations.utils import preflabel -from sastadev.SAFreader import (commentsheaders, levelheaders, speakerheaders, - stagesheaders, unalignedheaders, uttidheaders) +from sastadev.SAFreader import (commentsheaders, levelheaders, literallevels, + speakerheaders, stagesheaders, + unalignedheaders, uttidheaders) SAF_COMMENT_LEVEL = preflabel(commentsheaders, str.capitalize) SAF_COMMENT_COLUMN = preflabel(commentsheaders, str.capitalize) @@ -21,6 +22,8 @@ SAF_SPEAKER_HEADER = preflabel(speakerheaders, str.capitalize) SAF_SPEAKER_COLUMNS = list(map(str.lower, speakerheaders)) +SAF_LITERAL_LEVELS = list(map(str.lower, literallevels)) + # Composed headers PRE_WORDS_HEADERS = [SAF_UTT_HEADER, SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL] POST_WORDS_HEADERS = [SAF_FASES_COLUMN, SAF_COMMENT_COLUMN] diff --git a/backend/annotations/writer.py b/backend/annotations/writer.py index 36885677..5b869cb6 100644 --- a/backend/annotations/writer.py +++ b/backend/annotations/writer.py @@ -1,6 +1,6 @@ -from io import BytesIO import itertools from dataclasses import dataclass, field +from io import BytesIO from typing import Dict, List, Tuple from analysis.models import MethodCategory @@ -10,7 +10,7 @@ from sastadev.allresults import AllResults, ResultsKey from sastadev.methods import Method from sastadev.sastatypes import ExactResults - +from sastadev.ASTApostfunctions import getposlemmas from .constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS, SAF_COMMENT_LEVEL, SAF_UTT_LEVEL) from .utils import autosize_columns, format_worksheet, get_max_words, ljust @@ -49,6 +49,7 @@ def __post_init__(self) -> None: } self.utt_n_rows = (len(all_levels)) self.anno_headers = self._annotations_header_row() + self._getlemmas() self.make_workbook() def write(self, target: BytesIO) -> None: @@ -160,3 +161,8 @@ def _append_fase(self, row: int, fase: str) -> None: current.add(fase) new = sep.join(sorted(list(current))) fase_cell.value = new + + def _getlemmas(self): + res = getposlemmas(self.results, ('A051', 'A051')) + assert True + # assert False diff --git a/backend/conftest.py b/backend/conftest.py index 966f58d2..e0f71487 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -9,6 +9,8 @@ from sastadev.allresults import AllResults from sastadev.conf import settings as sd_settings +from lxml import etree + @pytest.fixture def cha_testfiles_dir(): @@ -79,7 +81,10 @@ def asta_method(db, asta_category, method_dir): @pytest.fixture -def single_utt_allresults(): +def single_utt_allresults(cha_testfiles_dir): + parsed = etree.parse( + op.join(cha_testfiles_dir, 'single_utt_corrected.xml')) + utts = parsed.xpath('alpino_ds') return AllResults( uttcount=1, coreresults={'A029': Counter({1: 1}), 'A045': Counter({1: 1}), @@ -93,8 +98,8 @@ def single_utt_allresults(): postresults={'A046': Counter(), 'A049': Counter()}, allmatches=None, # Not provided in this fixture filename='single_utt', - analysedtrees=[(1, None)], - annotationinput=False, + analysedtrees=[(n + 1, tree) for n, tree in enumerate(utts)], + 
annotationinput=True, allutts={1: ['ja', 'uh', 'ik', 'vind', 'het', 'beetje', 'moeilijk', 'om', 'het', 'goed', 'te', 'vertellen', 'want', 'ik', 'heb', 'een', 'ongeluk', 'gehad']} diff --git a/backend/setup.cfg b/backend/setup.cfg index 344cc0c5..f6e96ba2 100644 --- a/backend/setup.cfg +++ b/backend/setup.cfg @@ -11,7 +11,7 @@ ignore_missing_imports = True exclude = sastadev .env -ignore = W503 +ignore = E501, W503 max-complexity = 10 # pytest fixtures need this funky import stye, hence the ignore F401 and F811 per-file-ignores = From 21c86fc96f1b3879321f3911a8f44ed5d30685a7 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 3 Apr 2024 11:03:10 +0200 Subject: [PATCH 11/36] Simple querying using sastadev --- backend/analysis/query/query_transcript.py | 46 ++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 backend/analysis/query/query_transcript.py diff --git a/backend/analysis/query/query_transcript.py b/backend/analysis/query/query_transcript.py new file mode 100644 index 00000000..60f47d96 --- /dev/null +++ b/backend/analysis/query/query_transcript.py @@ -0,0 +1,46 @@ +from collections import namedtuple +from typing import Tuple +from analysis.models import AssessmentMethod, Transcript +from sastadev.sastacore import SastaCoreParameters, sastacore +from sastadev.targets import get_targets +from lxml import etree +from sastadev.methods import Method + + +def prepare_parameters(infilename: str, method: Method, targets: int, annotate: bool) -> SastaCoreParameters: + # TODO: check corr/corrn + + return SastaCoreParameters( + annotationinput=annotate, + themethod=method.to_sastadev(), + infilename=infilename, + targets=targets + ) + + +def prepare_treebanks(transcript: Transcript) -> Tuple[Tuple[str, etree.ElementTree]]: + orig_fp = transcript.parsed_content.path + corr_fp = transcript.corrected_content.path + orig_treebank = etree.parse(orig_fp).getroot() + corr_treebank = etree.parse(corr_fp).getroot() + return ( + (orig_fp, orig_treebank), + (corr_fp, corr_treebank) + ) + + +def run_sastacore(transcript: Transcript, method: AssessmentMethod, annotate: bool = False): + orig_tb, corr_tb = prepare_treebanks(transcript) + + # Retrieve targets from corrected treebank + targets = get_targets(corr_tb[1]) + params = prepare_parameters(corr_tb[0], method, targets, annotate) + + res = sastacore( + origtreebank=orig_tb[1], + correctedtreebank=corr_tb[1], + annotatedfileresults=None, + scp=params + ) + + return res From 6eaa8ed3df63bc202804640c974ea56c0e2fe091 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 3 Apr 2024 16:47:44 +0200 Subject: [PATCH 12/36] Remove old annotations to xlsx --- backend/analysis/query/xlsx_output.py | 188 +------------------------- 1 file changed, 4 insertions(+), 184 deletions(-) diff --git a/backend/analysis/query/xlsx_output.py b/backend/analysis/query/xlsx_output.py index b14d5792..06408998 100644 --- a/backend/analysis/query/xlsx_output.py +++ b/backend/analysis/query/xlsx_output.py @@ -1,32 +1,24 @@ -import traceback from collections import Counter from typing import List, Tuple -from analysis.annotations.constants import SAF_COMMENT_LEVEL, SAF_UNALIGNED_LEVEL from analysis.models import AssessmentMethod -from analysis.query.functions import QueryWithFunction from analysis.results.results import AllResults from openpyxl import Workbook -from openpyxl.styles import Font, PatternFill -from openpyxl.styles.protection import Protection +from openpyxl.styles import Font from openpyxl.utils import get_column_letter from 
openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder -ROMAN_NUMS = [None, 'I', 'II', 'III', - 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X'] +QUERYCOUNT_HEADERS = ['Query', 'Item', 'Phase', 'Utterance', 'Matches'] -BEFORE_WORDS_HEADERS = ['ID', 'Level', SAF_UNALIGNED_LEVEL] -AFTER_WORDS_HEADERS = ['Dummy', 'Fases', SAF_COMMENT_LEVEL] - -def querycounts_to_xlsx(allresults: AllResults, queries: List[QueryWithFunction]): +def querycounts_to_xlsx(allresults: AllResults, method: AssessmentMethod): all_data = dict(allresults.coreresults, **allresults.postresults) wb = Workbook() worksheet = wb.active # header - worksheet.append(['Query', 'Item', 'Fase', 'Utterance', 'Matches']) + worksheet.append(QUERYCOUNT_HEADERS) header = worksheet["1:1"] for cell in header: cell.font = Font(bold=True) @@ -68,178 +60,6 @@ def querycounts_to_xlsx(allresults: AllResults, queries: List[QueryWithFunction] return wb - -def annotations_to_xlsx(allresults, method): - try: - wb = Workbook() - worksheet = wb.active - - items = sorted(allresults.annotations.items()) - max_words = max([len(words) for (_, words) in items]) - headers = get_headers(max_words) - worksheet.append(headers) - - zc_embeddings = method.category.zc_embeddings - - levels, lower_levels = get_levels(method) - - for utt_id, words in items: - # Utt row, containing the word tokens - words_row = [utt_id, 'Utt'] + [w.word for w in words] - - # a cell for each word, and one to record phases - level_rows = make_levels_rows(max_words, levels, utt_id) - - if zc_embeddings: - zc_rows = make_zc_rows(max_words, utt_id, words) - else: - zc_rows = None - - comment_rows = make_levels_rows(max_words, ['Commentaar'], utt_id) - - for word in words: - process_word(zc_embeddings, lower_levels, level_rows, zc_rows, comment_rows, word.index, word) - - append_utterance_rows( - worksheet, - words_row, - level_rows, - zc_rows, - comment_rows - ) - - format_worksheet(worksheet) - autosize_columns(worksheet) - - return wb - - except Exception: - traceback.print_exc() - - -def process_word(zc_embeddings, lower_levels, level_rows, zc_rows, comment_rows, i_word, word) -> None: - '''Iterate over word hits and fill the corresponding level''' - for hit in word.hits: - if zc_embeddings and hit['level'].lower() == 'zc': - i_level = word.zc_embedding - process_hit(zc_rows, i_word, hit, i_level) - else: - i_level = lower_levels.index(hit['level'].lower()) - process_hit(level_rows, i_word, hit, i_level) - if word.comments: - comment_rows[0][get_word_column(i_word)].add(word.comments) - - -def process_hit(rows, i_word: int, hit, i_level: int) -> None: - '''Add the hit to the right place in the rows, and append the fase as roman numeral''' - rows[i_level][get_word_column(i_word)].add(hit['item']) - try: - rows[i_level][-1].append( - ROMAN_NUMS[int(hit['fase'])]) - except Exception: - pass - - -def get_word_column(word_index: int) -> int: - # Substract 1 because position 0 is unaligned, which is present in BEFORE_WORDS_HEADERS - return word_index + len(BEFORE_WORDS_HEADERS) - 1 - - -def append_utterance_rows(worksheet, words_row, levels_rows, zc_rows, comment_rows) -> None: - '''Append all rows for an utterance: - words - levels - zc levels (optional) - ''' - worksheet.append(words_row) - append_level_rows(levels_rows, worksheet) - append_level_rows(zc_rows, worksheet) - append_level_rows(comment_rows, worksheet) - - -def concat_cell(cell): - if (isinstance(cell, set) or isinstance(cell, list)): - try: - return ','.join(sorted(cell)) or None - except Exception: - return None - 
return cell - - -def append_level_rows(rows, worksheet) -> None: - '''Condense cells to comma separated strings and append them to worksheet''' - if not rows: - return - for row in rows: - row = [concat_cell(cell) - for cell in row] - worksheet.append(row) - - -def make_levels_rows(max_words: int, levels: List[str], utt_id: int): - level_rows = [ - [utt_id, level] - + [set([])] # unaligned - + [set([]) for _ in range(max_words + 1)] - + [[]] # fases - # Everything after fases is undefined so fases are easy to find with -1 - for level in levels - ] - return level_rows - - -def make_zc_rows(max_words: int, utt_id: int, words): - '''Rows for Zc levels. At least one, but more if deeper embeddings are present. - ''' - embed_levels = {w.zc_embedding for w in words} - max_embed = max(embed_levels) - zc_levels = ['Zc'] * (max_embed + 1) # N + 1 Zc levels - return make_levels_rows(max_words, zc_levels, utt_id) - - -def get_headers(max_words: int) -> List[str]: - word_headers = [f'Word{i}' for i in range(1, max_words + 1)] - headers = BEFORE_WORDS_HEADERS + word_headers + AFTER_WORDS_HEADERS - - return headers - - -def get_levels(method: AssessmentMethod) -> Tuple[List[str], List[str]]: - '''Lowercased list of all levels (excluding ZC)''' - levels = method.category.levels - if method.category.zc_embeddings: - levels = [lv for lv in levels if lv.lower() != 'Zc'.lower()] - lower_levels = list(map(str.lower, levels)) - return levels, lower_levels - - -def format_worksheet(worksheet) -> None: - '''Locks all cells except annotation fields. Gives utterance rows a yellow background.''' - - # start by locking the entire sheet - worksheet.protection.sheet = True - unlocked = Protection(locked=False) - - header = worksheet["1:1"] - for cell in header: - # bold headers - cell.font = Font(bold=True) - - # yelow background for each utterance row - for row in list(worksheet.rows)[1:]: - if row[1].value == 'Utt': - for cell in row: - cell.fill = PatternFill( - start_color="ffff00", - end_color="ffff00", - fill_type="solid") - else: - # unlock non-utterance rows - # skip the first two columns (utt number and level) - for cell in row[2:]: - cell.protection = unlocked - - def autosize_columns(worksheet) -> None: dim_holder = DimensionHolder(worksheet=worksheet) for col in range(worksheet.min_column, worksheet.max_column + 1): From 4ecfbedbe30cd22ff757a1e788e52160a71102db Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 9 Apr 2024 14:21:05 +0200 Subject: [PATCH 13/36] Adapt SAFWriter to ResultsKey --- backend/annotations/writer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/annotations/writer.py b/backend/annotations/writer.py index 5b869cb6..bd33d1fa 100644 --- a/backend/annotations/writer.py +++ b/backend/annotations/writer.py @@ -115,8 +115,14 @@ def _make_levels_rows(self, ws: Worksheet) -> None: def _fill_query(self, query_id: ResultsKey, exact_results: ExactResults): '''Find and fill all cells for a single query''' - query = self.method.queries.get(query_id) - item = query.item + lemma_item = None + if isinstance(query_id, Tuple) and not query_id[0] == query_id[1]: + # Lemma queries hold the lemma in second position + lemma_item = query_id[1] + + simple_query_id = query_id[0] + query = self.method.queries.get(simple_query_id) + item = lemma_item or query.item fase = query.fase for utt_id, word_nr in exact_results: From 07efed66cdf0afab4c813e7e09416106e5d5c458 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 9 Apr 2024 14:34:01 +0200 Subject: [PATCH 
14/36] Annotate (xlsx) in sastacore --- backend/analysis/query/query_transcript.py | 14 +++++++++----- backend/analysis/query/run.py | 11 +++++++++-- backend/analysis/views.py | 19 +++++++++---------- backend/annotations/conftest.py | 2 +- backend/annotations/writer_tests.py | 2 +- .../annotations/{writer.py => writer_xlsx.py} | 0 6 files changed, 29 insertions(+), 19 deletions(-) rename backend/annotations/{writer.py => writer_xlsx.py} (100%) diff --git a/backend/analysis/query/query_transcript.py b/backend/analysis/query/query_transcript.py index 60f47d96..41418196 100644 --- a/backend/analysis/query/query_transcript.py +++ b/backend/analysis/query/query_transcript.py @@ -1,4 +1,3 @@ -from collections import namedtuple from typing import Tuple from analysis.models import AssessmentMethod, Transcript from sastadev.sastacore import SastaCoreParameters, sastacore @@ -7,11 +6,16 @@ from sastadev.methods import Method -def prepare_parameters(infilename: str, method: Method, targets: int, annotate: bool) -> SastaCoreParameters: +def prepare_parameters(infilename: str, method: Method, targets: int, annotationinput: bool) -> SastaCoreParameters: # TODO: check corr/corrn + if annotationinput: + # If existing annotations exist + # dont supply origtreebank + pass + return SastaCoreParameters( - annotationinput=annotate, + annotationinput=annotationinput, themethod=method.to_sastadev(), infilename=infilename, targets=targets @@ -29,12 +33,12 @@ def prepare_treebanks(transcript: Transcript) -> Tuple[Tuple[str, etree.ElementT ) -def run_sastacore(transcript: Transcript, method: AssessmentMethod, annotate: bool = False): +def run_sastacore(transcript: Transcript, method: AssessmentMethod, annotation_input: bool = False): orig_tb, corr_tb = prepare_treebanks(transcript) # Retrieve targets from corrected treebank targets = get_targets(corr_tb[1]) - params = prepare_parameters(corr_tb[0], method, targets, annotate) + params = prepare_parameters(corr_tb[0], method, targets, annotation_input) res = sastacore( origtreebank=orig_tb[1], diff --git a/backend/analysis/query/run.py b/backend/analysis/query/run.py index 119f1905..1d1f9650 100644 --- a/backend/analysis/query/run.py +++ b/backend/analysis/query/run.py @@ -1,12 +1,14 @@ import logging from collections import Counter, defaultdict from typing import Dict, List, Set - from analysis.annotations.safreader import SAFReader from analysis.models import (AnalysisRun, AssessmentMethod, AssessmentQuery, Transcript, Utterance) -from analysis.results.results import AllResults, SastaMatches, SastaResults +from analysis.query.query_transcript import run_sastacore +from analysis.results.results import SastaMatches, SastaResults + from sastadev.query import Query, core_process, post_process, pre_process +from sastadev.allresults import AllResults from .functions import (QueryWithFunction, compile_queries, filter_queries, single_query_single_utt, utt_from_tree) @@ -14,6 +16,11 @@ logger = logging.getLogger('sasta') +def annotate_transcript(transcript: Transcript, method: AssessmentMethod) -> AllResults: + allresults, _samplesize = run_sastacore(transcript, method) + return allresults + + def query_transcript(transcript: Transcript, method: AssessmentMethod, annotate: bool = False, diff --git a/backend/analysis/views.py b/backend/analysis/views.py index 80e81af4..e3f213d7 100644 --- a/backend/analysis/views.py +++ b/backend/analysis/views.py @@ -5,11 +5,11 @@ import logging from io import BytesIO, StringIO -from analysis.annotations.enrich_chat import enrich_chat +from 
annotations.writer_cha import enrich_chat from analysis.annotations.safreader import SAFReader -from analysis.query.run import query_transcript -from analysis.query.xlsx_output import annotations_to_xlsx, querycounts_to_xlsx -from annotations.writer import SAFWriter +from analysis.query.run import annotate_transcript, query_transcript +from analysis.query.xlsx_output import querycounts_to_xlsx +from annotations.writer_xlsx import SAFWriter from celery import group from convert.chat_writer import ChatWriter from django.db.models import Q @@ -96,21 +96,20 @@ def query(self, request, *args, **kwargs): @action(detail=True, methods=['POST'], name='Annotate') def annotate(self, request, *args, **kwargs): + # Retrieve objects transcript = self.get_object() method_id = request.data.get('method') - method = AssessmentMethod.objects.get(pk=method_id) - zc_embed = method.category.zc_embeddings - allresults, queries_with_funcs = query_transcript( - transcript, method, True, zc_embed - ) + # Perform the actual querying + allresults = annotate_transcript(transcript, method) + # Always create an XLSX file for AnalysisRun purposes writer = SAFWriter(method.to_sastadev(), allresults) spreadsheet = writer.workbook - self.create_analysis_run(transcript, method, spreadsheet) + # Adapt output to requested format format = request.data.get('format', 'xlsx') if format == 'xlsx': diff --git a/backend/annotations/conftest.py b/backend/annotations/conftest.py index 66638b95..7a2271d9 100644 --- a/backend/annotations/conftest.py +++ b/backend/annotations/conftest.py @@ -1,7 +1,7 @@ import pytest -from annotations.writer import SAFWriter +from annotations.writer_xlsx import SAFWriter @pytest.fixture diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py index 44c5dbfc..9c3df229 100644 --- a/backend/annotations/writer_tests.py +++ b/backend/annotations/writer_tests.py @@ -1,7 +1,7 @@ from annotations.constants import (SAF_COMMENT_COLUMN, SAF_FASES_COLUMN, SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL, SAF_UTT_HEADER) -from annotations.writer import SAFWriter +from annotations.writer_xlsx import SAFWriter from .utils import ljust diff --git a/backend/annotations/writer.py b/backend/annotations/writer_xlsx.py similarity index 100% rename from backend/annotations/writer.py rename to backend/annotations/writer_xlsx.py From e351863e77c0a0aed6cc55b43723cc2ae3e82c00 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 9 Apr 2024 14:35:25 +0200 Subject: [PATCH 15/36] Add natsort package --- backend/requirements.in | 1 + backend/requirements.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/backend/requirements.in b/backend/requirements.in index f54d7d0c..70612e33 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -8,6 +8,7 @@ django-livereload-server django-rest-auth[with_social] django-revproxy>=0.9.16 lxml==4.9.1 +natsort numpy<1.22 pandas==1.3.* psycopg2 diff --git a/backend/requirements.txt b/backend/requirements.txt index 73e2c8c6..03fc781e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -122,6 +122,8 @@ murmurhash==1.0.7 # preshed # spacy # thinc +natsort==8.4.0 + # via -r requirements.in numpy==1.21.6 # via # -r requirements.in From 42dba3613402fb2d2210bfea196bed508c911b52 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 9 Apr 2024 15:25:31 +0200 Subject: [PATCH 16/36] Annotate (cha) in sastacore --- backend/analysis/annotations/enrich_chat.py | 42 ------------- backend/annotations/writer_cha.py | 66 +++++++++++++++++++++ 2 
files changed, 66 insertions(+), 42 deletions(-) delete mode 100644 backend/analysis/annotations/enrich_chat.py create mode 100644 backend/annotations/writer_cha.py diff --git a/backend/analysis/annotations/enrich_chat.py b/backend/analysis/annotations/enrich_chat.py deleted file mode 100644 index cfb43e2c..00000000 --- a/backend/analysis/annotations/enrich_chat.py +++ /dev/null @@ -1,42 +0,0 @@ -import itertools -from typing import List - -from analysis.models import AssessmentMethod, Transcript -from analysis.results.results import AllResults -from chamd.chat_reader import ChatLine, ChatTier -from convert.chat_reader import ChatDocument - - -def find_doc_line(lines: List[ChatLine], uttno: int) -> ChatLine: - # TODO: more efficient way to do this? - return next((x for x in lines if x.uttid == uttno), None) - - -def enrich_chat(transcript: Transcript, - allresults: AllResults, - method: AssessmentMethod) -> ChatDocument: - doc = ChatDocument.from_chatfile(transcript.content.path, transcript.corpus.method_category) - - # construct a mapping of uttno to uttid - # because uttid is unknown to CHAT - marked_utts = (x for x in transcript.utterances.all() if x.for_analysis) - id_no_mapping = { - u.utt_id: u.uttno for u in marked_utts - } - - items = sorted(allresults.annotations.items()) - for utt_id, words in items: - uttno = id_no_mapping.get(utt_id) - doc_line = find_doc_line(doc.lines, uttno) - flattened_hits = itertools.chain(*(w.hits for w in words)) - annotations = [x.get('item') for x in flattened_hits] - if annotations: - annotation_str = ', '.join(annotations) - doc_line.tiers['xsyn'] = ChatTier(id='xsyn', text=annotation_str) - # id_headers = [h for h in doc.headers if h.line.startswith('@ID')] - # last_id_header = max(id_headers, key=attrgetter('linestartno')) - # doc.headers.append(ChatHeader( - # line=f'@Comment:\tAnnotations on %xsyn-tiers generated by SASTA, using {method.category.name}', - # linestartno=last_id_header.linestartno+1)) - - return doc diff --git a/backend/annotations/writer_cha.py b/backend/annotations/writer_cha.py new file mode 100644 index 00000000..36793db7 --- /dev/null +++ b/backend/annotations/writer_cha.py @@ -0,0 +1,66 @@ +from collections import defaultdict +from typing import Dict, List + +from analysis.models import AssessmentMethod, Transcript +from analysis.results.results import AllResults +from chamd.chat_reader import ChatLine, ChatTier +from convert.chat_reader import ChatDocument +from natsort import natsorted +from sastadev.sastatypes import ExactResultsDict + + +def _items_by_utt_word(exactresults: ExactResultsDict, items_mapping: Dict) -> Dict: + # filter out empty + results = {k: v for k, v in exactresults.items() if v} + + # create nested defaultdict: dictionary of dictionaries of lists + out = defaultdict(lambda: defaultdict(list)) + + for (qid, _), hits in results.items(): + for (utt_id, wordno) in hits: + out[utt_id][wordno].append(items_mapping.get(qid)) + + return out + + +def _find_doc_line(lines: List[ChatLine], uttno: int) -> ChatLine: + # TODO: more efficient way to do this? 
+ return next((x for x in lines if x.uttid == uttno), None) + + +def enrich_chat(transcript: Transcript, + allresults: AllResults, + method: AssessmentMethod) -> ChatDocument: + doc = ChatDocument.from_chatfile( + transcript.content.path, transcript.corpus.method_category) + + # construct a mapping of uttno to uttid + # because uttid is unknown to CHAT + marked_utts = (x for x in transcript.utterances.all() if x.for_analysis) + id_no_mapping = { + u.utt_id: u.uttno for u in marked_utts + } + + # create mapping of query_ids to items + items_mapping = {q.query_id: q.item for q in method.queries.all()} + + results_by_word = _items_by_utt_word( + allresults.exactresults, items_mapping) + + for utt_id, words in results_by_word.items(): + uttno = id_no_mapping.get(int(utt_id)) + doc_line = _find_doc_line(doc.lines, uttno) + + utt_hits = [] + for w in natsorted(words.keys()): + utt_hits.extend(words[w]) + + annotation_str = ', '.join(utt_hits) + doc_line.tiers['xsyn'] = ChatTier(id='xsyn', text=annotation_str) + # id_headers = [h for h in doc.headers if h.line.startswith('@ID')] + # last_id_header = max(id_headers, key=attrgetter('linestartno')) + # doc.headers.append(ChatHeader( + # line=f'@Comment:\tAnnotations on %xsyn-tiers generated by SASTA, using {method.category.name}', + # linestartno=last_id_header.linestartno+1)) + + return doc From 9051d806c6a5c81f1e277c86e7dd8022466d6782 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 9 Apr 2024 15:29:16 +0200 Subject: [PATCH 17/36] Generate form in sastacore --- backend/analysis/views.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/backend/analysis/views.py b/backend/analysis/views.py index e3f213d7..22774ad6 100644 --- a/backend/analysis/views.py +++ b/backend/analysis/views.py @@ -178,19 +178,17 @@ def upload_annotations(self, request, *args, **kwargs): @action(detail=True, methods=['POST'], name='Generate form') def generateform(self, request, *args, **kwargs): + # Retrieve objects transcript = self.get_object() method_id = request.data.get('method') method = AssessmentMethod.objects.get(pk=method_id) - zc_embed = method.category.zc_embeddings # Find the form function for this method form_func = method.category.get_form_function() if not form_func: raise ParseError(detail='No form definition for this method.') - allresults, _ = query_transcript( - transcript, method, annotate=False, zc_embed=zc_embed, - ) + allresults = annotate_transcript(transcript, method) form = form_func(allresults, None, in_memory=True) From fedb9ad959c9033ffa3bddf14673ee6b1f55a104 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Tue, 9 Apr 2024 16:38:47 +0200 Subject: [PATCH 18/36] Querycounts in sastacore --- backend/analysis/query/xlsx_output.py | 68 ----------------------- backend/analysis/views.py | 8 +-- backend/annotations/writer_querycounts.py | 68 +++++++++++++++++++++++ 3 files changed, 72 insertions(+), 72 deletions(-) delete mode 100644 backend/analysis/query/xlsx_output.py create mode 100644 backend/annotations/writer_querycounts.py diff --git a/backend/analysis/query/xlsx_output.py b/backend/analysis/query/xlsx_output.py deleted file mode 100644 index 06408998..00000000 --- a/backend/analysis/query/xlsx_output.py +++ /dev/null @@ -1,68 +0,0 @@ -from collections import Counter -from typing import List, Tuple - -from analysis.models import AssessmentMethod -from analysis.results.results import AllResults -from openpyxl import Workbook -from openpyxl.styles import Font -from openpyxl.utils import get_column_letter 
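[Editorial note for the querycounts writer introduced below: its key step folds lemma-specific ResultsKeys back onto plain query ids before writing rows. A self-contained sketch of that reduction; the example counters are modeled on the single_utt fixture, not taken from a real run.]

    from collections import Counter, defaultdict

    # Sketch: lemma queries yield one key per lemma, e.g. ('A051', 'beet'),
    # so their per-utterance counts are merged under the bare query id.
    res = {('A051', 'beet'): Counter({'1': 1}),
           ('A051', 'hebben'): Counter({'1': 1, '2': 2}),
           ('A029', 'A029'): Counter({'1': 1})}

    reduced = defaultdict(Counter)
    for (qid, _), cntr in res.items():
        reduced[qid] += cntr

    print(reduced['A051'])  # Counter({'1': 2, '2': 2})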
-from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder - -QUERYCOUNT_HEADERS = ['Query', 'Item', 'Phase', 'Utterance', 'Matches'] - - -def querycounts_to_xlsx(allresults: AllResults, method: AssessmentMethod): - all_data = dict(allresults.coreresults, **allresults.postresults) - - wb = Workbook() - worksheet = wb.active - - # header - worksheet.append(QUERYCOUNT_HEADERS) - header = worksheet["1:1"] - for cell in header: - cell.font = Font(bold=True) - - query_mapping = { - q.query.id: (q.query.fase or 0, q.query.item) - for q in queries - if q.query.id in all_data - } - sorted_queries = sorted( - sorted( - query_mapping.items(), - key=lambda item: item[0] - ), - key=lambda item: item[1][0] - ) - - for qid, (fase, item) in sorted_queries: - fase = fase if fase else 'nvt' - data = all_data[qid] - - if isinstance(data, int): - row = [qid, item, fase, 'total', data] - worksheet.append(row) - elif isinstance(data, Counter): - first_row = [qid, item, fase, 'total', sum(data.values())] - worksheet.append(first_row) - for utt in sorted(data): - if isinstance(utt, Tuple): - row = [None, None, None, utt[-1], data[utt]] - else: - row = [None, None, None, utt, data[utt]] - worksheet.append(row) - - worksheet.auto_filter.ref = worksheet.dimensions - - # column widths - autosize_columns(worksheet) - - return wb - -def autosize_columns(worksheet) -> None: - dim_holder = DimensionHolder(worksheet=worksheet) - for col in range(worksheet.min_column, worksheet.max_column + 1): - dim_holder[get_column_letter(col)] = ColumnDimension( - worksheet, min=col, max=col, auto_size=True) - worksheet.column_dimensions = dim_holder diff --git a/backend/analysis/views.py b/backend/analysis/views.py index 22774ad6..a9919815 100644 --- a/backend/analysis/views.py +++ b/backend/analysis/views.py @@ -5,10 +5,10 @@ import logging from io import BytesIO, StringIO +from annotations.writer_querycounts import querycounts_to_xlsx from annotations.writer_cha import enrich_chat from analysis.annotations.safreader import SAFReader -from analysis.query.run import annotate_transcript, query_transcript -from analysis.query.xlsx_output import querycounts_to_xlsx +from analysis.query.run import annotate_transcript from annotations.writer_xlsx import SAFWriter from celery import group from convert.chat_writer import ChatWriter @@ -87,9 +87,9 @@ def query(self, request, *args, **kwargs): content_type=SPREADSHEET_MIMETYPE) response['Content-Disposition'] = "attachment; filename=matches_output.xlsx" - allresults, queries_with_funcs = query_transcript(transcript, method) + allresults = annotate_transcript(transcript, method) - spreadsheet = querycounts_to_xlsx(allresults, queries_with_funcs) + spreadsheet = querycounts_to_xlsx(allresults, method) spreadsheet.save(response) return response diff --git a/backend/annotations/writer_querycounts.py b/backend/annotations/writer_querycounts.py new file mode 100644 index 00000000..4a546980 --- /dev/null +++ b/backend/annotations/writer_querycounts.py @@ -0,0 +1,68 @@ +from collections import Counter, defaultdict + +from analysis.models import AssessmentMethod +from annotations.constants import SAF_FASES_COLUMN, SAF_UTT_HEADER +from natsort import natsorted +from openpyxl import Workbook +from openpyxl.styles import Font +from openpyxl.utils import get_column_letter +from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder +from sastadev.allresults import AllResults +from sastadev.reduceresults import exact2results + +QUERYCOUNT_HEADERS = ['Query', 'Item', + 
SAF_FASES_COLUMN, SAF_UTT_HEADER, 'Matches'] + +TOTAL_LABEL = 'totaal' +NOT_APPLICABLE_LABEL = 'nvt' + + +def querycounts_to_xlsx(allresults: AllResults, method: AssessmentMethod): + wb = Workbook() + worksheet = wb.active + + # header + worksheet.append(QUERYCOUNT_HEADERS) + header = worksheet["1:1"] + for cell in header: + cell.font = Font(bold=True) + + nonempty_queries = {k: v for k, v in allresults.exactresults.items() if v} + + res = exact2results(nonempty_queries) + + # need to reduce the results + # because of double results for lemma queries + reduced_results = defaultdict(Counter) + for (k, _), v in res.items(): + reduced_results[k] += v + + # write rows of data + for qid in natsorted(reduced_results): + # get query info + cntr = reduced_results[qid] + q = method.queries.get(query_id=qid) + + # write the total row + total_row = [qid, q.item, q.fase or NOT_APPLICABLE_LABEL, + TOTAL_LABEL, sum(cntr.values())] + worksheet.append(total_row) + + for utt_id in natsorted(cntr.keys()): + row = [None, None, None, utt_id, cntr[utt_id]] + worksheet.append(row) + + worksheet.auto_filter.ref = worksheet.dimensions + + # column widths + autosize_columns(worksheet) + + return wb + + +def autosize_columns(worksheet) -> None: + dim_holder = DimensionHolder(worksheet=worksheet) + for col in range(worksheet.min_column, worksheet.max_column + 1): + dim_holder[get_column_letter(col)] = ColumnDimension( + worksheet, min=col, max=col, auto_size=True) + worksheet.column_dimensions = dim_holder From 5aee0cc17ec89147fb8969eefa9321bbdbc13ba4 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 10 Apr 2024 10:38:09 +0200 Subject: [PATCH 19/36] Fix SAFWriter --- backend/annotations/writer_xlsx.py | 9 ++------- backend/conftest.py | 28 ++++++++++++++++++---------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/backend/annotations/writer_xlsx.py b/backend/annotations/writer_xlsx.py index bd33d1fa..a57041ac 100644 --- a/backend/annotations/writer_xlsx.py +++ b/backend/annotations/writer_xlsx.py @@ -10,7 +10,6 @@ from sastadev.allresults import AllResults, ResultsKey from sastadev.methods import Method from sastadev.sastatypes import ExactResults -from sastadev.ASTApostfunctions import getposlemmas from .constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS, SAF_COMMENT_LEVEL, SAF_UTT_LEVEL) from .utils import autosize_columns, format_worksheet, get_max_words, ljust @@ -49,7 +48,6 @@ def __post_init__(self) -> None: } self.utt_n_rows = (len(all_levels)) self.anno_headers = self._annotations_header_row() - self._getlemmas() self.make_workbook() def write(self, target: BytesIO) -> None: @@ -127,6 +125,8 @@ def _fill_query(self, query_id: ResultsKey, exact_results: ExactResults): for utt_id, word_nr in exact_results: # We cannot assume that utterances are numbered 1-N sequentially + if isinstance(utt_id, str): + utt_id = int(utt_id) utt_nr = list(self.results.allutts.keys()).index(utt_id) row, col = self._cell_location(utt_nr, query.level, word_nr) cell = self.anno_ws.cell(row, col) @@ -167,8 +167,3 @@ def _append_fase(self, row: int, fase: str) -> None: current.add(fase) new = sep.join(sorted(list(current))) fase_cell.value = new - - def _getlemmas(self): - res = getposlemmas(self.results, ('A051', 'A051')) - assert True - # assert False diff --git a/backend/conftest.py b/backend/conftest.py index e0f71487..8deeed8c 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -85,17 +85,25 @@ def single_utt_allresults(cha_testfiles_dir): parsed = etree.parse( op.join(cha_testfiles_dir, 
'single_utt_corrected.xml')) utts = parsed.xpath('alpino_ds') + return AllResults( - uttcount=1, - coreresults={'A029': Counter({1: 1}), 'A045': Counter({1: 1}), - 'A001': Counter({1: 1}), 'A003': Counter({1: 2}), - 'A013': Counter({1: 1}), 'A018': Counter({1: 2}), - 'A021': Counter({1: 2}), 'A024': Counter({1: 2})}, - exactresults={'A029': [(1, 1)], 'A045': [(1, 2)], 'A001': [(1, 7)], - 'A003': [(1, 8), (1, 13)], 'A013': [(1, 4)], - 'A018': [(1, 12), (1, 18)], 'A021': [(1, 6), (1, 17)], - 'A024': [(1, 4), (1, 15)]}, - postresults={'A046': Counter(), 'A049': Counter()}, + uttcount=2, + coreresults={('A029', 'A029'): Counter({'1': 1}), ('A045', 'A045'): Counter({'1': 1}), + ('A001', 'A001'): Counter({'1': 1}), ('A003', 'A003'): Counter({'1': 2}), + ('A013', 'A013'): Counter({'1': 1}), ('A018', 'A018'): Counter({'1': 2}), + ('A021', 'A021'): Counter({'1': 2}), ('A024', 'A024'): Counter({'1': 2}), + ('A051', 'beet'): Counter({'1': 1}), ('A051', 'vertellen'): Counter({'1': 1}), + ('A051', 'ongeluk'): Counter({'1': 1}), ('A051', 'hebben'): Counter({'1': 1})}, + + exactresults={('A029', 'A029'): [('1', 1)], ('A045', 'A045'): [('1', 2)], + ('A001', 'A001'): [('1', 7)], ('A003', 'A003'): [('1', 8), ('1', 13)], + ('A013', 'A013'): [('1', 4)], ('A018', 'A018'): [('1', 12), ('1', 18)], + ('A021', 'A021'): [('1', 6), ('1', 17)], ('A024', 'A024'): [('1', 4), ('1', 15)], + ('A051', 'beet'): [('1', 6)], ('A051', 'vertellen'): [('1', 12)], + ('A051', 'ongeluk'): [('1', 17)], ('A051', 'hebben'): [('1', 18)], + }, + postresults={'A046': Counter({('beet', '1'): 1, ('ongeluk', '1'): 1}), + 'A049': Counter({('vertellen', '1'): 1, ('hebben', '1'): 1})}, allmatches=None, # Not provided in this fixture filename='single_utt', analysedtrees=[(n + 1, tree) for n, tree in enumerate(utts)], From 33940a469a0697228a1c68775e8c37b738751dd0 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 10 Apr 2024 13:11:14 +0200 Subject: [PATCH 20/36] Better fix for SAFWriter --- backend/analysis/query/run.py | 1 - backend/annotations/writer_xlsx.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/analysis/query/run.py b/backend/analysis/query/run.py index 1d1f9650..ecddd4c3 100644 --- a/backend/analysis/query/run.py +++ b/backend/analysis/query/run.py @@ -15,7 +15,6 @@ logger = logging.getLogger('sasta') - def annotate_transcript(transcript: Transcript, method: AssessmentMethod) -> AllResults: allresults, _samplesize = run_sastacore(transcript, method) return allresults diff --git a/backend/annotations/writer_xlsx.py b/backend/annotations/writer_xlsx.py index a57041ac..d0845906 100644 --- a/backend/annotations/writer_xlsx.py +++ b/backend/annotations/writer_xlsx.py @@ -125,9 +125,10 @@ def _fill_query(self, query_id: ResultsKey, exact_results: ExactResults): for utt_id, word_nr in exact_results: # We cannot assume that utterances are numbered 1-N sequentially - if isinstance(utt_id, str): - utt_id = int(utt_id) - utt_nr = list(self.results.allutts.keys()).index(utt_id) + try: + utt_nr = list(self.results.allutts.keys()).index(utt_id) + except ValueError: + utt_nr = list(self.results.allutts.keys()).index(int(utt_id)) row, col = self._cell_location(utt_nr, query.level, word_nr) cell = self.anno_ws.cell(row, col) self._append_item(cell, item) From 7ca78771e1e63651aeed6834803d0f23b99be87b Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 10 Apr 2024 13:55:07 +0200 Subject: [PATCH 21/36] Use latest run in analysis in sastacore --- backend/analysis/models.py | 7 +++ 
 backend/analysis/query/query_transcript.py | 53 ++++++++++++++++++----
 backend/analysis/query/run.py | 10 +++-
 3 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/backend/analysis/models.py b/backend/analysis/models.py
index 81616b38..e10cdd96 100644
--- a/backend/analysis/models.py
+++ b/backend/analysis/models.py
@@ -206,6 +206,13 @@ def convertable(self):
     def parseable(self):
         return self.status in (self.CONVERTED, self.PARSING_FAILED)
 
+    @property
+    def latest_run(self):
+        try:
+            return self.analysisruns.latest()
+        except AnalysisRun.DoesNotExist:
+            return None
+
 
 class Utterance(models.Model):
     sentence = models.CharField(max_length=500)
diff --git a/backend/analysis/query/query_transcript.py b/backend/analysis/query/query_transcript.py
index 41418196..d2815cf3 100644
--- a/backend/analysis/query/query_transcript.py
+++ b/backend/analysis/query/query_transcript.py
@@ -4,6 +4,8 @@
 from sastadev.targets import get_targets
 from lxml import etree
 from sastadev.methods import Method
+from sastadev.SAFreader import get_golddata, richscores2scores
+from sastadev.allresults import AllResults
 
 
 def prepare_parameters(infilename: str, method: Method, targets: int, annotationinput: bool) -> SastaCoreParameters:
@@ -12,11 +14,12 @@ def prepare_parameters(infilename: str, method: Method, targets: int, annotation
     if annotationinput:
         # If existing annotations exist
         # dont supply origtreebank
+        # infilename becomes the path to existing SAF file
         pass
 
     return SastaCoreParameters(
         annotationinput=annotationinput,
-        themethod=method.to_sastadev(),
+        themethod=method,
         infilename=infilename,
         targets=targets
     )
@@ -33,18 +36,50 @@ def prepare_treebanks(transcript: Transcript) -> Tuple[Tuple[str, etree.ElementT
     )
 
 
+def get_annotated_fileresults(transcript: Transcript, method: Method, includeimplies: bool = False) -> AllResults:
+    infilename = transcript.latest_run.annotation_file.path
+    allutts, richexactscores = get_golddata(infilename, method.item2idmap, method.altcodes,
+                                            method.queries, includeimplies)
+    exactresults = richscores2scores(richexactscores)
+    annotatedfileresults = AllResults(uttcount=len(allutts),
+                                      coreresults={},
+                                      exactresults=exactresults,
+                                      postresults={},
+                                      allmatches={},
+                                      filename=infilename,
+                                      analysedtrees=[],
+                                      allutts=allutts,
+                                      annotationinput=True)
+    return annotatedfileresults
+
+
 def run_sastacore(transcript: Transcript, method: AssessmentMethod, annotation_input: bool = False):
+    # get treebanks
     orig_tb, corr_tb = prepare_treebanks(transcript)
-    # Retrieve targets from corrected treebank
     targets = get_targets(corr_tb[1])
-    params = prepare_parameters(corr_tb[0], method, targets, annotation_input)
+    # Convert method to sastadev version
+    sdmethod = method.to_sastadev()
 
-    res = sastacore(
-        origtreebank=orig_tb[1],
-        correctedtreebank=corr_tb[1],
-        annotatedfileresults=None,
-        scp=params
-    )
+    if annotation_input:
+        existing_results = get_annotated_fileresults(transcript, sdmethod)
+        params = prepare_parameters(
+            transcript.latest_run.annotation_file.path,
+            sdmethod, targets, annotation_input)
+        res = sastacore(
+            origtreebank=None,
+            correctedtreebank=corr_tb[1],
+            annotatedfileresults=existing_results,
+            scp=params
+        )
+    else:
+        params = prepare_parameters(
+            corr_tb[0], sdmethod, targets, annotation_input)
+        res = sastacore(
+            origtreebank=orig_tb[1],
+            correctedtreebank=corr_tb[1],
+            annotatedfileresults=None,
+            scp=params
+        )
 
     return res
diff --git a/backend/analysis/query/run.py b/backend/analysis/query/run.py
index ecddd4c3..89ce1f03 100644
--- a/backend/analysis/query/run.py
+++ b/backend/analysis/query/run.py
@@ -15,8 +15,14 @@
 logger = logging.getLogger('sasta')
 
 
-def annotate_transcript(transcript: Transcript, method: AssessmentMethod) -> AllResults:
-    allresults, _samplesize = run_sastacore(transcript, method)
+
+def annotate_transcript(transcript: Transcript, method: AssessmentMethod, ignore_existing: bool = False) -> AllResults:
+    if transcript.latest_run and not ignore_existing:
+        # run sastacore with pre-existing SAF file
+        allresults, _samplesize = run_sastacore(transcript, method, True)
+    else:
+        # run sastacore normally
+        allresults, _samplesize = run_sastacore(transcript, method, False)
     return allresults

From acfb884646f73586e745063236d6d6412daae821 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 13:57:20 +0200
Subject: [PATCH 22/36] Move annotations writers to subdir

---
 backend/analysis/views.py | 6 +++---
 backend/annotations/conftest.py | 2 +-
 backend/annotations/writer_tests.py | 2 +-
 .../{writer_querycounts.py => writers/querycounts.py} | 0
 backend/annotations/{writer_cha.py => writers/saf_chat.py} | 0
 backend/annotations/{writer_xlsx.py => writers/saf_xlsx.py} | 6 +++---
 6 files changed, 8 insertions(+), 8 deletions(-)
 rename backend/annotations/{writer_querycounts.py => writers/querycounts.py} (100%)
 rename backend/annotations/{writer_cha.py => writers/saf_chat.py} (100%)
 rename backend/annotations/{writer_xlsx.py => writers/saf_xlsx.py} (96%)

diff --git a/backend/analysis/views.py b/backend/analysis/views.py
index a9919815..7136a218 100644
--- a/backend/analysis/views.py
+++ b/backend/analysis/views.py
@@ -5,11 +5,11 @@
 import logging
 from io import BytesIO, StringIO
 
-from annotations.writer_querycounts import querycounts_to_xlsx
-from annotations.writer_cha import enrich_chat
 from analysis.annotations.safreader import SAFReader
 from analysis.query.run import annotate_transcript
-from annotations.writer_xlsx import SAFWriter
+from annotations.writers.querycounts import querycounts_to_xlsx
+from annotations.writers.saf_chat import enrich_chat
+from annotations.writers.saf_xlsx import SAFWriter
 from celery import group
 from convert.chat_writer import ChatWriter
 from django.db.models import Q
diff --git a/backend/annotations/conftest.py b/backend/annotations/conftest.py
index 7a2271d9..c4a43274 100644
--- a/backend/annotations/conftest.py
+++ b/backend/annotations/conftest.py
@@ -1,7 +1,7 @@
 import pytest
 
-from annotations.writer_xlsx import SAFWriter
+from annotations.saf_xlsx import SAFWriter
 
 
 @pytest.fixture
diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py
index 9c3df229..d8d6cb42 100644
--- a/backend/annotations/writer_tests.py
+++ b/backend/annotations/writer_tests.py
@@ -1,7 +1,7 @@
 from annotations.constants import (SAF_COMMENT_COLUMN, SAF_FASES_COLUMN,
                                    SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL,
                                    SAF_UTT_HEADER)
-from annotations.writer_xlsx import SAFWriter
+from annotations.saf_xlsx import SAFWriter
 
 from .utils import ljust
diff --git a/backend/annotations/writer_querycounts.py b/backend/annotations/writers/querycounts.py
similarity index 100%
rename from backend/annotations/writer_querycounts.py
rename to backend/annotations/writers/querycounts.py
diff --git a/backend/annotations/writer_cha.py b/backend/annotations/writers/saf_chat.py
similarity index 100%
rename from backend/annotations/writer_cha.py
rename to backend/annotations/writers/saf_chat.py
diff --git a/backend/annotations/writer_xlsx.py b/backend/annotations/writers/saf_xlsx.py
similarity index 96%
rename from backend/annotations/writer_xlsx.py
rename to backend/annotations/writers/saf_xlsx.py
index d0845906..34f779dd 100644
--- a/backend/annotations/writer_xlsx.py
+++ b/backend/annotations/writers/saf_xlsx.py
@@ -10,9 +10,9 @@
 from sastadev.allresults import AllResults, ResultsKey
 from sastadev.methods import Method
 from sastadev.sastatypes import ExactResults
-from .constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS,
-                        SAF_COMMENT_LEVEL, SAF_UTT_LEVEL)
-from .utils import autosize_columns, format_worksheet, get_max_words, ljust
+from annotations.constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS,
+                                   SAF_COMMENT_LEVEL, SAF_UTT_LEVEL)
+from annotations.utils import autosize_columns, format_worksheet, get_max_words, ljust
 
 
 @dataclass

From 1214360a451744bedfc76df037a18feecd0e07b4 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:14:55 +0200
Subject: [PATCH 23/36] Use sastadev reader when uploading annotations

---
 backend/analysis/query/query_transcript.py | 22 ++++------------------
 backend/analysis/views.py | 11 +++++++----
 backend/annotations/reader.py | 21 +++++++++++++++++++++
 3 files changed, 32 insertions(+), 22 deletions(-)
 create mode 100644 backend/annotations/reader.py

diff --git a/backend/analysis/query/query_transcript.py b/backend/analysis/query/query_transcript.py
index d2815cf3..3e1333d7 100644
--- a/backend/analysis/query/query_transcript.py
+++ b/backend/analysis/query/query_transcript.py
@@ -7,6 +7,8 @@
 from sastadev.SAFreader import get_golddata, richscores2scores
 from sastadev.allresults import AllResults
 
+from annotations.reader import read_saf
+
 
 def prepare_parameters(infilename: str, method: Method, targets: int, annotationinput: bool) -> SastaCoreParameters:
     # TODO: check corr/corrn
@@ -36,23 +38,6 @@ def prepare_treebanks(transcript: Transcript) -> Tuple[Tuple[str, etree.ElementT
     )
 
 
-def get_annotated_fileresults(transcript: Transcript, method: Method, includeimplies: bool = False) -> AllResults:
-    infilename = transcript.latest_run.annotation_file.path
-    allutts, richexactscores = get_golddata(infilename, method.item2idmap, method.altcodes,
-                                            method.queries, includeimplies)
-    exactresults = richscores2scores(richexactscores)
-    annotatedfileresults = AllResults(uttcount=len(allutts),
-                                      coreresults={},
-                                      exactresults=exactresults,
-                                      postresults={},
-                                      allmatches={},
-                                      filename=infilename,
-                                      analysedtrees=[],
-                                      allutts=allutts,
-                                      annotationinput=True)
-    return annotatedfileresults
-
-
 def run_sastacore(transcript: Transcript, method: AssessmentMethod, annotation_input: bool = False):
     # get treebanks
     orig_tb, corr_tb = prepare_treebanks(transcript)
@@ -62,7 +47,8 @@ def run_sastacore(transcript: Transcript, method: AssessmentMethod, annotation_i
     sdmethod = method.to_sastadev()
 
     if annotation_input:
-        existing_results = get_annotated_fileresults(transcript, sdmethod)
+        existing_results = read_saf(
+            transcript.latest_run.annotation_file.path, sdmethod)
         params = prepare_parameters(
             transcript.latest_run.annotation_file.path,
             sdmethod, targets, annotation_input)
diff --git a/backend/analysis/views.py b/backend/analysis/views.py
index 7136a218..17f8a71e 100644
--- a/backend/analysis/views.py
+++ b/backend/analysis/views.py
@@ -7,6 +7,7 @@
 from analysis.annotations.safreader import SAFReader
 from analysis.query.run import annotate_transcript
+from annotations.reader import read_saf
 from annotations.writers.querycounts import querycounts_to_xlsx
 from annotations.writers.saf_chat import enrich_chat
 from annotations.writers.saf_xlsx import SAFWriter
@@ -164,15 +165,17 @@ def upload_annotations(self, request, *args, **kwargs):
         new_run = self.create_analysis_run(obj, latest_run.method, file, is_manual=True)
 
         try:
-            reader = SAFReader(new_run.annotation_file.path, latest_run.method, obj)
+            read_saf(new_run.annotation_file.path,
+                     latest_run.method.to_sastadev())
         except Exception as e:
             new_run.delete()
             logger.exception(e)
             return Response(str(e), status.HTTP_400_BAD_REQUEST)
 
-        if reader.errors:
-            new_run.delete()
-            return Response(reader.formatted_errors(), status.HTTP_400_BAD_REQUEST)
+        # TODO: re-enable proper error logging for reading SAF files
+        # if reader.errors:
+        #     new_run.delete()
+        #     return Response(reader.formatted_errors(), status.HTTP_400_BAD_REQUEST)
 
         return Response('Success', status.HTTP_200_OK)
diff --git a/backend/annotations/reader.py b/backend/annotations/reader.py
new file mode 100644
index 00000000..99cf4c56
--- /dev/null
+++ b/backend/annotations/reader.py
@@ -0,0 +1,21 @@
+from sastadev.allresults import AllResults
+from sastadev.methods import Method
+from sastadev.SAFreader import get_golddata, richscores2scores
+
+
+def read_saf(saf_filename: str, method: Method, includeimplies: bool = False) -> AllResults:
+    '''Wrapper around SASTADEV SAF reader'''
+    infilename = saf_filename
+    allutts, richexactscores = get_golddata(infilename, method.item2idmap, method.altcodes,
+                                            method.queries, includeimplies)
+    exactresults = richscores2scores(richexactscores)
+    annotatedfileresults = AllResults(uttcount=len(allutts),
+                                      coreresults={},
+                                      exactresults=exactresults,
+                                      postresults={},
+                                      allmatches={},
+                                      filename=infilename,
+                                      analysedtrees=[],
+                                      allutts=allutts,
+                                      annotationinput=True)
+    return annotatedfileresults

From c803bda7ad08672d403b3fc7bb34a3ba453f7c0c Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:16:30 +0200
Subject: [PATCH 24/36] Remove SASTA's own querying mechanisms

---
 backend/analysis/query/functions.py | 122 ------------------
 backend/analysis/query/query_transcript.py | 2 -
 backend/analysis/query/run.py | 135 +--------------------
 3 files changed, 2 insertions(+), 257 deletions(-)
 delete mode 100644 backend/analysis/query/functions.py

diff --git a/backend/analysis/query/functions.py b/backend/analysis/query/functions.py
deleted file mode 100644
index 3679379e..00000000
--- a/backend/analysis/query/functions.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import logging
-from operator import attrgetter
-from typing import Callable, Dict, List, Union
-
-from analysis.models import AssessmentMethod, AssessmentQuery
-from analysis.results.results import UtteranceWord
-from analysis.score.zc_embedding import get_zc_embeddings
-from bs4 import BeautifulSoup as Soup
-from django.db.models import Q
-from lxml import etree as ET
-from sastadev.external_functions import form_map, str2functionmap
-from sastadev.macros import expandmacros, macrodict
-from sastadev.query import Query
-
-logger = logging.getLogger('sasta')
-
-
-class QueryWithFunction:
-    def __init__(self, query, function):
-        self.id: str = query.id
-        self.query: Query = query
-        self.function: Union[Callable, ET.XPath] = function
-
-    def __repr__(self):
-        return f'{self.id}: {type(self.function)}'
-
-
-def compile_queries(queries: List[Query]) -> List[QueryWithFunction]:
-    results = []
-    # macrodict = get_macros_dict()
-
-    for query_model in queries:
-        query = query_model.to_sastadev()
-        func = compile_xpath_or_func(query.query, macrodict)
-        if func:
-            results.append(QueryWithFunction(query, func))
-    return results
-
-
-def compile_xpath_or_func(query: str,
-                          macrodict: Dict) -> Union[Callable, ET.XPath]:
-    try:
-        if query in str2functionmap:
-            return str2functionmap[query]
-        expanded_query = expandmacros(query)
-        return ET.XPath(expanded_query)
-    except Exception as error:
-        logger.warning(f'cannot compile {query.strip()}:\t{error}')
-        return None
-
-
-def filter_queries(method: AssessmentMethod,
-                   phase: int = None,
-                   phase_exact: bool = True):
-    '''
-    # TODO: remove phase filtering?
-    phase_exact:True returns only that phase
-                False returns everything up to (and including) that phase
-    '''
-    try:
-        form_queries = [f.__name__ for f in form_map.values()]
-        all_queries = AssessmentQuery.objects.all().filter(
-            Q(method=method)
-            & Q(query__isnull=False)
-            & ~Q(query__exact='')
-            & ~Q(query__in=form_queries)
-            & Q(inform='yes')
-        )
-        if phase:
-            phase_filter = Q(fase=phase) if phase_exact else Q(
-                fase__gte=phase)
-            phase_queries = all_queries.filter(phase_filter)
-            return phase_queries
-        return all_queries
-
-    except Exception as e:
-        logger.warning(f'cannot filter queries for phase:\t{e}')
-        print(e)
-
-
-def single_query_single_utt(query_func: Union[Callable, ET.XPath],
-                            syntree: ET._Element) -> List[ET._Element]:
-    try:
-        results = query_func(syntree)
-        return results
-    except Exception:
-        logger.warning(f'Failed to execute {query_func}')
-        return []
-
-
-def utt_from_tree(tree: str, embeddings=False) -> List[UtteranceWord]:
-    # From a LASSY syntax tree, construct utterance representation
-    # Output: sorted list of UtteranceWord instances
-    soup = Soup(tree, 'lxml')
-    utt = soup.alpino_ds
-
-    embed_dict = get_zc_embeddings(ET.fromstring(tree)) if embeddings else None
-
-    words = utt.findAll('node', {'word': True})
-
-    unaligned = UtteranceWord(
-        word='',
-        begin=-1,
-        end=0,
-        hits=[],
-        zc_embedding=0 if embed_dict else None
-    )
-
-    utt_words = [unaligned] + [UtteranceWord(
-        word=w.get('word'),
-        begin=w.get('begin'),
-        end=w.get('end'),
-        hits=[],
-        zc_embedding=embed_dict[str(w.get('begin'))] if embed_dict else None)
-        for w in words]
-
-    # Sort the words and assign their real index
-    sorted_words = sorted(utt_words, key=attrgetter('begin'))
-    for i, w in enumerate(sorted_words):
-        w.index = i
-
-    return sorted_words
diff --git a/backend/analysis/query/query_transcript.py b/backend/analysis/query/query_transcript.py
index 3e1333d7..c888add9 100644
--- a/backend/analysis/query/query_transcript.py
+++ b/backend/analysis/query/query_transcript.py
@@ -4,8 +4,6 @@
 from sastadev.targets import get_targets
 from lxml import etree
 from sastadev.methods import Method
-from sastadev.SAFreader import get_golddata, richscores2scores
-from sastadev.allresults import AllResults
 
 from annotations.reader import read_saf
diff --git a/backend/analysis/query/run.py b/backend/analysis/query/run.py
index 89ce1f03..6eb8462d 100644
--- a/backend/analysis/query/run.py
+++ b/backend/analysis/query/run.py
@@ -1,18 +1,9 @@
 import logging
-from collections import Counter, defaultdict
-from typing import Dict, List, Set
 
-from analysis.annotations.safreader import SAFReader
-from analysis.models import (AnalysisRun, AssessmentMethod, AssessmentQuery,
-                             Transcript, Utterance)
-from analysis.query.query_transcript import run_sastacore
-from analysis.results.results import SastaMatches, SastaResults
-from sastadev.query import Query, core_process, post_process, pre_process
+from analysis.models import (AssessmentMethod, Transcript)
+from analysis.query.query_transcript import run_sastacore
 from sastadev.allresults import AllResults
 
-from .functions import (QueryWithFunction, compile_queries, filter_queries,
-                        single_query_single_utt, utt_from_tree)
-
 logger = logging.getLogger('sasta')
 
 
@@ -24,125 +15,3 @@ def annotate_transcript(transcript: Transcript, method: AssessmentMethod, ignore
         # run sastacore normally
         allresults, _samplesize = run_sastacore(transcript, method, False)
     return allresults
-
-
-def query_transcript(transcript: Transcript,
-                     method: AssessmentMethod,
-                     annotate: bool = False,
-                     zc_embed: bool = False):
-    # TODO: LOGGING
-
-    queries: List[AssessmentQuery] = filter_queries(method)
-    queries_with_funcs: List[QueryWithFunction] = compile_queries(queries)
-    utterances: List[Utterance] = Utterance.objects.filter(
-        transcript=transcript)
-    to_analyze_utterances = [x for x in utterances if x.for_analysis]
-    utterance_syntrees = [(x.utt_id, x.syntree) for x in to_analyze_utterances]
-    allutts = {utt.utt_id: utt.word_list for utt in to_analyze_utterances}
-    logger.info(
-        f'Analyzing {len(to_analyze_utterances)} of {len(utterances)} utterances..')
-
-    coreresults, allmatches, exact_results, corelevels, annotations = run_core_queries(
-        to_analyze_utterances,
-        queries_with_funcs,
-        zc_embed,
-        annotate)
-
-    annotationinput = False
-    runs = AnalysisRun.objects.filter(transcript=transcript)
-    if runs:  # An annotations file exists, base further results on this
-        latest_run = runs.latest()
-        reader = SAFReader(filepath=latest_run.annotation_file.path,
-                           method=method, transcript=transcript)
-        coreresults = reader.document.to_allresults().coreresults
-        annotations = reader.document.reformatted_annotations
-        exact_results = reader.document.exactresults
-        annotationinput = True
-
-    allresults = AllResults(filename=transcript.name,
-                            uttcount=len(to_analyze_utterances),
-                            coreresults=coreresults,
-                            exactresults=exact_results,
-                            postresults=None,
-                            allmatches=allmatches,
-                            annotations=annotations,
-                            analysedtrees=utterance_syntrees,
-                            annotationinput=annotationinput,
-                            allutts=allutts
-                            )
-
-    run_post_queries(allresults, queries_with_funcs)
-    return allresults, queries_with_funcs
-
-
-def run_core_queries(utterances: List[Utterance],
-                     queries: List[QueryWithFunction],
-                     zc_embed: bool,
-                     annotate: bool):
-    levels: Set[str] = set([])
-    allmatches: SastaMatches = defaultdict(list)
-    results: SastaResults = {}
-    annotations = {}
-    exact_results = defaultdict(list)
-
-    core_queries: List[QueryWithFunction] = sorted(
-        [q for q in queries if q.query.process in [pre_process, core_process]],
-        key=lambda x: (x.query.process, x.query.id))
-
-    for utt in utterances:
-        if annotate:
-            utt_res = utt_from_tree(utt.parse_tree, zc_embed)
-        for q in core_queries:
-            matches = single_query_single_utt(q.function, utt.syntree)
-            if matches:
-                if q.id in results:
-                    results[q.id].update(
-                        {utt.utt_id: len(matches)})
-                else:
-                    results[q.id] = Counter(
-                        {utt.utt_id: len(matches)})
-                for m in matches:
-                    levels.add(q.query.level)
-                    # Record the match including the syntree
-                    allmatches[(q.id, utt.utt_id)].append((m, utt.syntree))
-                    # Record the exact word where the query was matched
-
-                    word_index = next((i for i, item in enumerate(
-                        utt.word_position_mapping) if item["begin"] == int(m.get('begin'))), None)
-                    # exact_results[q.id].append((utt.utt_id, int(m.get('begin')) + 1))
-                    exact_results[q.id].append((utt.utt_id, word_index))
-
-                    if annotate:
-                        begin = int(m.get('begin'))
-                        hit = {
-                            'level': q.query.level,
-                            'item': q.query.item,
-                            'fase': q.query.fase
-                        }
-                        matched_word = next(
-                            (w for w in utt_res if w.begin == begin), None)
-                        if matched_word:
-                            matched_word.hits.append(hit)
-                        else:
-                            logger.warning(
-                                f'Found hit ({q.query.level}, {q.query.item}, {q.query.fase}) for non-exising begin attr "{begin}"')
-        if annotate:
-            annotations[utt.utt_id] = utt_res
-
-    return (results, allmatches, exact_results, levels, annotations or None)
-
-
-def run_post_queries(allresults: SastaResults,
-                     queries: List[QueryWithFunction]) -> None:
-    post_queries: List[QueryWithFunction] = [
-        q for q in queries if q.query.process == post_process]
-    flat_queries: Dict[str, Query] = {q.id: q.query for q in queries}
-
-    for q in post_queries:
-        try:
-            result = q.function(allresults, flat_queries)
-            if result is not None:
-                allresults.postresults[q.id] = result
-        except Exception as e:
-            # logger.warning(f'Failed to execute {q.function}')
-            logger.exception(e)

From ff418743e2bce5cd5fec9eb3457b831905a41c47 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:21:51 +0200
Subject: [PATCH 25/36] Fix (skip) SAFReader tests

---
 backend/analysis/annotations/safreader_test.py | 5 ++++-
 backend/annotations/conftest.py | 2 +-
 backend/annotations/writer_tests.py | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/backend/analysis/annotations/safreader_test.py b/backend/analysis/annotations/safreader_test.py
index 47d4157e..5804b589 100644
--- a/backend/analysis/annotations/safreader_test.py
+++ b/backend/analysis/annotations/safreader_test.py
@@ -3,11 +3,11 @@
 
 import pytest
 from analysis.annotations.safreader import SAFReader, get_word_levels
-from analysis.query.run import query_transcript
 from pandas import DataFrame
 from pytest_lazyfixture import lazy_fixture
 
 
+@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
 @pytest.mark.parametrize("method, transcript, filedir, samplenum", [
     (lazy_fixture("tarsp_method"), lazy_fixture("tarsp_transcript"),
      lazy_fixture("cha_testfiles_dir"), 5),
@@ -53,6 +53,7 @@ def test_read_saf(method, transcript, filedir, samplenum):
     assert true_results.allutts == read_results.allutts
 
 
+@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
 def test_astalex(asta_method, asta_transcript, asta_transcript_corrections, cha_testfiles_dir):
     true_results, _ = query_transcript(asta_transcript, asta_method, annotate=True, zc_embed=False)
     assert true_results.annotationinput
@@ -62,6 +63,7 @@ def test_astalex(asta_method, asta_transcript, asta_transcript_corrections, cha_
     assert True
 
 
+@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
 def test_wordlevels():
     data = {'level': map(str.lower, ['Uiting', 'QA', 'SZ',
                                      'Grammaticale Fout', 'Opmerkingen']),
@@ -72,6 +74,7 @@ def test_wordlevels():
     assert word_levels == ['qa', 'sz', 'grammaticale fout']
 
 
+@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
 def test_read_saf_comments(tarsp_method, tarsp_transcript, cha_testfiles_dir):
     reader = SAFReader(
         op.join(
diff --git a/backend/annotations/conftest.py b/backend/annotations/conftest.py
index c4a43274..00eabfe2 100644
--- a/backend/annotations/conftest.py
+++ b/backend/annotations/conftest.py
@@ -1,7 +1,7 @@
 import pytest
 
-from annotations.saf_xlsx import SAFWriter
+from annotations.writers.saf_xlsx import SAFWriter
 
 
 @pytest.fixture
diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py
index d8d6cb42..eceda8a6 100644
--- a/backend/annotations/writer_tests.py
+++ b/backend/annotations/writer_tests.py
@@ -1,7 +1,7 @@
 from annotations.constants import (SAF_COMMENT_COLUMN, SAF_FASES_COLUMN,
                                    SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL,
                                    SAF_UTT_HEADER)
-from annotations.saf_xlsx import SAFWriter
+from annotations.writers.saf_xlsx import SAFWriter
 
 from .utils import ljust

From 854ebd4fc114954e176a0e14b8a24e02e59bad5e Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:25:25 +0200
Subject: [PATCH 26/36] Remove redundant analysis.annotations module

---
 backend/analysis/annotations/__init__.py | 0
 .../analysis/annotations/annotation_format.py | 125 ------------
 backend/analysis/annotations/constants.py | 31 ---
 backend/analysis/annotations/safreader.py | 178 ------------------
 .../analysis/annotations/safreader_test.py | 87 ---------
 backend/analysis/annotations/utils.py | 105 ----------
 backend/analysis/models.py | 21 ---
 backend/analysis/views.py | 1 -
 8 files changed, 548 deletions(-)
 delete mode 100644 backend/analysis/annotations/__init__.py
 delete mode 100644 backend/analysis/annotations/annotation_format.py
 delete mode 100644 backend/analysis/annotations/constants.py
 delete mode 100644 backend/analysis/annotations/safreader.py
 delete mode 100644 backend/analysis/annotations/safreader_test.py
 delete mode 100644 backend/analysis/annotations/utils.py

diff --git a/backend/analysis/annotations/__init__.py b/backend/analysis/annotations/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/analysis/annotations/annotation_format.py b/backend/analysis/annotations/annotation_format.py
deleted file mode 100644
index 853a3685..00000000
--- a/backend/analysis/annotations/annotation_format.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import operator
-from collections import Counter, defaultdict
-from functools import reduce
-from typing import Dict, List, Optional
-
-from analysis.models import AssessmentMethod
-from analysis.results.results import (AllResults, SastaAllUtts, SastaAnnotations, SastaExactResults,
-                                      UtteranceWord)
-
-
-class SAFAnnotation:
-    def __init__(self, level, label, fase=None, query_id=None):
-        self.level: str = level
-        self.label: str = label
-        self.fase: str = fase
-        self.query_id: Optional[str] = query_id
-
-
-class SAFDocument:
-    def __init__(self, name, method, all_levels=None):
-        self.name: str = name
-        self.method: AssessmentMethod = method
-        self.utterances: List[SAFUtterance] = []
-        self.all_levels: Optional[List[str]] = all_levels
-        self.annotations: SastaAnnotations = {}
-        self.exactresults: SastaExactResults = defaultdict(list)
-        self.allutts: SastaAllUtts = defaultdict(list)
-
-    @property
-    def all_annotations(self):
-        return reduce(operator.concat,
-                      [utt.annotations for utt in self.utterances])
-
-    @property
-    def queries(self):
-        '''Set of all query IDs in the document'''
-        return set(
-            ann.query_id for ann in self.all_annotations
-        )
-
-    @property
-    def item_counts(self):
-        return {u.utt_id: u.item_counts for u in self.utterances}
-
-    def to_allresults(self) -> AllResults:
-        '''Convert to AllResults object (for query and scoring).'''
-        filename = self.name
-        uttcount = len(self.utterances)
-        results = {
-            q: Counter({
-                u.utt_id: u.item_counts[q]
-                for u in self.utterances
-                if u.item_counts[q] > 0
-            })
-            for q in self.queries
-        }
-
-        allresults = AllResults(
-            filename,
-            uttcount,
-            coreresults=results,
-            exactresults=self.exactresults,
-            allutts=self.allutts
-        )
-
-        return allresults
-
-    @property
-    def reformatted_annotations(self) -> Dict[int, List[UtteranceWord]]:
-        annotations = {}
-        for utt in self.utterances:
-            annotations[utt.utt_id] = []
-            for word in utt.words:
-                uw = UtteranceWord(
-                    word=word.text,
-                    begin=word.idx - 1,  # TODO: does this need to be normalized?
-                    end=word.idx,
-                    hits=[],
-                    idx=word.idx,
-                    zc_embedding=0,  # TODO: CHECK ZC EMBEDS,
-                    comments=word.comment
-                )
-                for ann in word.annotations:
-                    hit = self.hit_from_annotation(ann)
-                    uw.hits.append(hit)
-                annotations[utt.utt_id].append(uw)
-        return annotations
-
-    def hit_from_annotation(self, ann) -> Dict:
-        q = self.method.queries.get(query_id=ann.query_id)
-        # Try to match the actual alt item
-        item_matches = [ai for ai in q.altitems if ai.lower() == ann.label.lower()]
-        return {
-            'level': q.level,
-            'item': item_matches[0] if item_matches else q.item,
-            'fase': q.fase
-        }
-
-
-class SAFUtterance:
-    def __init__(self, utt_id):
-        self.utt_id: int = utt_id
-        self.words: List[SAFWord] = []
-
-    @property
-    def item_counts(self):
-        return sum([w.item_counts for w in self.words], Counter())
-
-    @property
-    def annotations(self):
-        return reduce(operator.concat, [w.annotations for w in self.words])
-
-
-class SAFWord:
-    def __init__(self, idx, text, begin, end, comment=None):
-        self.idx: int = idx
-        self.begin: int = begin
-        self.end: int = end
-        self.text: str = text
-        self.annotations: List[SAFAnnotation] = []
-        self.comment: str = comment or ''
-
-    @property
-    def item_counts(self):
-        return Counter({a.query_id for a in self.annotations if a.query_id})
diff --git a/backend/analysis/annotations/constants.py b/backend/analysis/annotations/constants.py
deleted file mode 100644
index 1865c5f1..00000000
--- a/backend/analysis/annotations/constants.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from typing import Counter, Dict, Optional, Tuple
-
-from annotations.constants import (SAF_COMMENT_COLUMN, SAF_COMMENT_HEADERS, SAF_COMMENT_LEVEL, SAF_FASES_COLUMN,
-                                   SAF_FASES_HEADERS, SAF_LEVEL_HEADER, SAF_LEVEL_HEADERS,
-                                   SAF_SPEAKER_COLUMNS, SAF_SPEAKER_HEADER, SAF_UNALIGNED_LEVEL, SAF_UNALIGNED_LEVELS, SAF_UTT_HEADER, SAF_UTT_LEVELS)
-
-# Type annotations
-TupleStrDict = Dict[Tuple[Optional[str], Optional[str]], str]
-CounterDict = Dict[str, Counter[str]]
-
-# Global
-ITEMSEPPATTERN = r'[,-; ]'
-LABELSEP = ','
-
-HEADER_VARIANTS = {
-    SAF_UTT_HEADER.lower(): SAF_UTT_LEVELS,
-    SAF_SPEAKER_HEADER.lower(): SAF_SPEAKER_COLUMNS,
-    SAF_UNALIGNED_LEVEL.lower(): SAF_UNALIGNED_LEVELS,
-    SAF_LEVEL_HEADER.lower(): SAF_LEVEL_HEADERS,
-    SAF_FASES_COLUMN.lower(): SAF_FASES_HEADERS,
-    SAF_COMMENT_COLUMN.lower(): SAF_COMMENT_HEADERS
-
-}
-
-
-PREFIX = ""
-ALTITEMSEP = IMPLIESSEP = ','
-
-# Define (lowercased) levels that should not be cleaned
-# Currently, only comment rows should be excempt
-NO_CLEAN_LEVELS = [*SAF_COMMENT_HEADERS]
diff --git a/backend/analysis/annotations/safreader.py b/backend/analysis/annotations/safreader.py
deleted file mode 100644
index b60dd31f..00000000
--- a/backend/analysis/annotations/safreader.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import logging
-import os
-from typing import List, Optional, Tuple
-
-import pandas as pd
-from analysis.models import Transcript
-from annotations.constants import (SAF_COMMENT_HEADERS, SAF_UNALIGNED_LEVEL,
-                                   SAF_UNALIGNED_LEVELS, SAF_UTT_HEADER,
-                                   SAF_UTT_LEVELS)
-
-from .annotation_format import (SAFAnnotation, SAFDocument, SAFUtterance,
-                                SAFWord)
-from .constants import LABELSEP, PREFIX
-from .utils import (clean_item, clean_row, enrich, getlabels, item2queryid,
-                    mkpatterns, standardize_header_name)
-
-logger = logging.getLogger('sasta')
-
-
-class NoWordDataException(Exception):
-    '''Raised when:
-        - There are no annotations for the word/level combination OR
-        - There is no word
-    '''
-    pass
-
-
-class UnalignedWord(Exception):
-    '''Raised when word is unaligned'''
-    pass
-
-
-def get_word_levels(data: pd.DataFrame):
-    levels = data.level
-    filtered_levels = levels[~levels.isin(
-        [*SAF_COMMENT_HEADERS, *SAF_UTT_LEVELS])]
-    return list(filtered_levels.unique())
-
-
-def is_word_column(column_name: str) -> bool:
-    return column_name.lower().startswith('word')
-
-
-def word_level_data(word_data: pd.DataFrame, colname: str):
-    '''returns combination word/level
-    '''
-    if colname.lower() in SAF_UNALIGNED_LEVELS:
-        raise UnalignedWord
-    elif word_data.empty:
-        raise NoWordDataException
-    utt_data = word_data.loc[word_data.level.isin(SAF_UTT_LEVELS), colname]
-    return utt_data
-
-
-class SAFReader:
-    def __init__(self, filepath, method, transcript: Transcript = None):
-        self.filepath = filepath
-        self.word_cols = []
-        self.levels: List[str] = []
-        self.data = self.loaddata(filepath)
-        self.method = method
-        self.transcript: Optional[Transcript] = transcript or None
-        self.item_mapping, self.patterns = self.make_mappings()
-        self.document = SAFDocument(os.path.basename(
-            filepath), method, self.levels)
-        self.errors: List[Tuple] = []
-        self.get_annotations(self.data)
-
-    def formatted_errors(self):
-        results = []
-        for (utt_id, word_id, text, level, label) in self.errors:
-            results.append(f'Unknown item "{label}" found in utterance {utt_id}, word {word_id} ("{text}"), level "{level}"')
-        return results
-
-    def loaddata(self, filepath):
-        data = pd.read_excel(filepath, engine='openpyxl')
-        data.rename(columns=standardize_header_name, inplace=True)
-        data = data.where(data.notnull(), None)
-        self.word_cols = [SAF_UNALIGNED_LEVEL.lower()] + \
-            list(filter(is_word_column, data.columns))
-
-        # Do we need to drop empty columns? Seems we don't. If otherwise, make sure word_columns are not dropped
-        # data.dropna(how='all', axis=1, inplace=True)
-
-        relevant_cols = [
-            *SAF_UTT_LEVELS,
-            'level',
-            *SAF_UNALIGNED_LEVELS,
-            *self.word_cols
-        ]
-        to_clean_cols = [col for col in set(
-            relevant_cols) if col in data.columns]
-        self.levels = [lv for lv in list(
-            data.level.dropna().unique()) if lv.lower() not in SAF_UTT_LEVELS]
-
-        data = data[to_clean_cols].apply(clean_row, axis='columns')
-
-        return data
-
-    def make_mappings(self):
-        item_mapping = self.method.get_item_mapping(LABELSEP)
-        items = [item for (item, _) in item_mapping if item]
-        patterns = mkpatterns(items)
-        return item_mapping, patterns
-
-    def get_annotations(self, data):
-        for utt_id in data[SAF_UTT_HEADER.lower()].unique():
-            utt_rows = data[data[SAF_UTT_HEADER.lower()] == utt_id]
-            parsed_utterance = self.parse_utterance(utt_id, utt_rows)
-            self.document.utterances.append(parsed_utterance)
-
-    def parse_utterance(self, utt_id, utt_data):
-        instance = SAFUtterance(utt_id)
-        utt_object = self.transcript.get_utterance_by_id(utt_id)
-        self.document.allutts[utt_object.utt_id] = utt_object.word_list
-        for idx, wcol in enumerate(self.word_cols):
-            relevant_cols = ['level', wcol]
-            word = self.parse_word(utt_id, idx,
-                                   wcol, utt_data[relevant_cols], utt_object.word_position_mapping)
-            if word:
-                instance.words.append(word)
-
-        return instance
-
-    def parse_word(self, utt_id, word_id, colname, word_data, wordposmap):
-        data = word_data
-        if colname != SAF_UNALIGNED_LEVEL.lower():
-            # Don't drop data for unaligned
-            data = word_data.dropna()
-
-        try:
-            utt_data = word_level_data(data, colname)
-            text = utt_data.iloc[0]
-
-        except UnalignedWord:
-            text = ''
-        except NoWordDataException:
-            return None
-
-        (begin, end) = wordposmap[word_id]['begin'], wordposmap[word_id]['end']
-        instance = SAFWord(word_id, text, begin, end)
-
-        word_levels = get_word_levels(data)
-        for level in word_levels:
-            item_data = data.loc[data.level == level, colname].iloc[0]
-            if not pd.isnull(item_data):
-                label = clean_item(item_data)
-                enriched_label = enrich(label, PREFIX.lower())
-                split_labels = getlabels(enriched_label, self.patterns)
-
-                if not split_labels:
-                    self.errors.append((utt_id, word_id, text, level, label))
-
-                self.map_labels(split_labels, instance,
-                                level, utt_id, word_id, text)
-
-        # read comments
-        comment_data = data.loc[data.level.isin(SAF_COMMENT_HEADERS)].dropna()
-        if not comment_data.empty:
-            instance.comment = str(comment_data[colname].iloc[0])
-
-        return instance
-
-    def map_labels(self, split_labels: List[str], saf_word: SAFWord, level: str, utt_id, word_id, text):
-        for label in split_labels:
-            mapped = item2queryid(label, level, self.item_mapping)
-            if mapped:
-                query_id, fase = mapped
-                saf_word.annotations.append(SAFAnnotation(
-                    level, label, fase, query_id))
-                self.document.exactresults[query_id].append(
-                    (utt_id, word_id))
-
-            else:
-                logger.warning(
-                    'Cannot resolve query_id for (%s, %s)', level, label)
-                self.errors.append(
-                    (utt_id, word_id, text, level, label))
diff --git a/backend/analysis/annotations/safreader_test.py b/backend/analysis/annotations/safreader_test.py
deleted file mode 100644
index 5804b589..00000000
--- a/backend/analysis/annotations/safreader_test.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os.path as op
-from operator import itemgetter
-
-import pytest
-from analysis.annotations.safreader import SAFReader, get_word_levels
-from pandas import DataFrame
-from pytest_lazyfixture import lazy_fixture
-
-
-@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
-@pytest.mark.parametrize("method, transcript, filedir, samplenum", [
-    (lazy_fixture("tarsp_method"), lazy_fixture("tarsp_transcript"),
-     lazy_fixture("cha_testfiles_dir"), 5),
-    (lazy_fixture("asta_method"), lazy_fixture("asta_transcript"),
-     lazy_fixture("cha_testfiles_dir"), 16)
-]
-)
-def test_read_saf(method, transcript, filedir, samplenum):
-    true_results, _ = query_transcript(transcript, method, annotate=True, zc_embed=method.category.zc_embeddings)
-    assert not true_results.annotationinput
-
-    reader = SAFReader(op.join(filedir, f'sample_{samplenum}_SAF.xlsx'), method, transcript)
-    read_results = reader.document.to_allresults()
-
-    # are the coreresults the same?
-    sorted_read = sorted(read_results.coreresults.keys())
-    sorted_true = sorted(true_results.coreresults.keys())
-    assert sorted_read == sorted_true
-    for q, hits in read_results.coreresults.items():
-        true_hits = true_results.coreresults[q]
-        assert hits == true_hits
-
-    # are all the annotations the same?
-    assert true_results.annotations.keys() == reader.document.reformatted_annotations.keys()
-    for q, annos in true_results.annotations.items():
-        true_annos = reader.document.reformatted_annotations[q]
-        for word in annos:
-            true_word = next((w for w in true_annos if w.index == word.index), None)
-            hits = sorted(word.hits, key=itemgetter('level', 'item'))
-            if true_word:
-                true_hits = sorted(true_word.hits, key=itemgetter('level', 'item'))
-                assert hits == true_hits
-            else:
-                # if the true_word is not found (unaligned empty), make sure it didnt miss anything
-                assert hits == []
-
-    # are the exactresults the same?
-    true_exact = {k: sorted(v) for (k, v) in true_results.exactresults.items() if v != []}
-    read_exact = {k: sorted(v) for (k, v) in read_results.exactresults.items() if v != []}
-    assert true_exact == read_exact
-
-    # are the allutts the same?
-    assert true_results.allutts == read_results.allutts
-
-
-@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
-def test_astalex(asta_method, asta_transcript, asta_transcript_corrections, cha_testfiles_dir):
-    true_results, _ = query_transcript(asta_transcript, asta_method, annotate=True, zc_embed=False)
-    assert true_results.annotationinput
-
-    assert true_results.annotations.get(3)[7].hits == [{'level': 'Taalmaat', 'item': 'N', 'fase': 0}]
-
-    assert True
-
-
-@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
-def test_wordlevels():
-    data = {'level': map(str.lower, ['Uiting', 'QA', 'SZ',
-                                     'Grammaticale Fout', 'Opmerkingen']),
-            'word1': [1, None, 'X', 'V, BvBB', 'Hier staat wat commentaar']}
-    df_in = DataFrame.from_dict(data)
-
-    word_levels = get_word_levels(df_in)
-    assert word_levels == ['qa', 'sz', 'grammaticale fout']
-
-
-@pytest.mark.skip(reason="SAFReader replaced by SASTADEV equivalent")
-def test_read_saf_comments(tarsp_method, tarsp_transcript, cha_testfiles_dir):
-    reader = SAFReader(
-        op.join(
-            cha_testfiles_dir,
-            'sample_5_SAF_with_comments.xlsx'),
-        tarsp_method, tarsp_transcript)
-    sent = reader.document.utterances[3]
-    assert sent.words[1].comment == 'Ik vind hier iets van.'
-    assert sent.words[2].comment == '1'
-    assert sent.words[3].comment == 'En hier misschien ook wel iets van'
diff --git a/backend/analysis/annotations/utils.py b/backend/analysis/annotations/utils.py
deleted file mode 100644
index 5a2d7338..00000000
--- a/backend/analysis/annotations/utils.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import logging
-import re
-from typing import List, Pattern, Tuple
-
-import pandas
-
-from .constants import (HEADER_VARIANTS, ITEMSEPPATTERN, LABELSEP,
-                        NO_CLEAN_LEVELS, TupleStrDict)
-
-logger = logging.getLogger('sasta')
-
-
-def standardize_header_name(header: str) -> str:
-    '''lowercase and standardize header'''
-    header = header.lower()
-    for key, val in HEADER_VARIANTS.items():
-        if header in val:
-            return key
-    return header
-
-
-def clean_row(row: pandas.Series) -> pandas.Series:
-    if row.level.lower() in NO_CLEAN_LEVELS:
-        row.level = row.level.lower()
-        return row
-    return row.apply(clean_cell)
-
-
-def clean_cell(cell):
-    if isinstance(cell, str):
-        result = cell
-        result = result.lstrip()
-        result = result.rstrip()
-        result = result.lower()
-        return result
-    return cell
-
-
-def mkpatterns(allcodes: List[str]) -> Tuple[Pattern, Pattern]:
-    basepattern = r''
-    sortedallcodes = sorted(allcodes, key=len, reverse=True)
-    adaptedcodes = [codeadapt(c) for c in sortedallcodes]
-    basepattern = r'' + '|'.join(adaptedcodes) + '|' + ITEMSEPPATTERN
-    fullpattern = r'^(' + basepattern + r')*$'
-
-    return (re.compile(basepattern), re.compile(fullpattern))
-
-
-def codeadapt(code: str) -> str:
-    result = code
-    result = re.sub(r'\.', r'\\.', result)
-    result = re.sub(r'\(', r'\\(', result)
-    result = re.sub(r'\)', r'\\)', result)
-    result = re.sub(r'\?', r'\\?', result)
-    result = re.sub(r'\*', r'\\*', result)
-    result = re.sub(r'\+', r'\\+', result)
-    result = re.sub(r' ', r'\\s+', result)
-
-    return result
-
-
-def enrich(labelstr: str, lcprefix: str) -> str:
-    if not labelstr:
-        return labelstr
-    try:
-        labels = labelstr.split(LABELSEP)
-        newlabels = []
-        for label in labels:
-            if label != "" and lcprefix != "":
-                newlabels.append(lcprefix + ":" + label)
-            else:
-                newlabels.append(label)
-        result = LABELSEP.join(newlabels)
-        return result
-    except TypeError:
-        logger.warning('non-str enrich: %s %s', labelstr, type(labelstr))
-        return labelstr
-
-
-def getlabels(labelstr, patterns):
-    results = []
-    (pattern, fullpattern) = patterns
-    if fullpattern.match(labelstr):
-        matches = pattern.finditer(labelstr)
-        results = [m.group(0) for m in matches if m.group(0) not in ' ;,-/']
-    else:
-        results = []
-        matches = pattern.finditer(labelstr)
-        logstr = str([m.group(0) for m in matches if m.group(0) not in ' ;,-'])
-        logger.warning('Cannot interpret %s; found items: %s',
-                       labelstr, logstr)
-    return results
-
-
-def clean_item(item: str):
-    clean_item = item.lower().strip()
-    clean_item = re.sub(pattern=r' +', repl=' ', string=clean_item)
-    return clean_item
-
-
-def item2queryid(item: str, level: str,
-                 mapping: TupleStrDict):
-    if (item, level) in mapping:
-        return mapping[(item, level)]
-    return None
diff --git a/backend/analysis/models.py b/backend/analysis/models.py
index e10cdd96..6cc29e23 100644
--- a/backend/analysis/models.py
+++ b/backend/analysis/models.py
@@ -7,7 +7,6 @@
 from typing import Dict, List, Tuple
 from uuid import uuid4
 
-from analysis.annotations.utils import clean_item
 from analysis.managers import SastaQueryManager
 from django.contrib.auth.models import User
 from django.contrib.postgres.fields import ArrayField
@@ -80,13 +79,6 @@ class Meta:
         unique_together = (('category', 'name'))
         get_latest_by = ('date_added', )
 
-    def get_item_mapping(self, sep):
-        queries = self.queries.all()
-        mapping = {}
-        for q in queries:
-            mapping.update(q.get_item_mapping(sep))
-        return mapping
-
     def to_sastadev(self) -> Method:
         cat_name = self.category.name.lower()
         location = self.content.path
@@ -335,19 +327,6 @@ def get_items_list(self, str, sep, lower=True):
             return []
         return cleanresult
 
-    def get_item_mapping(self, sep):
-        ''' mapping of all possible items (including altitems) to this query'''
-        if (not self.item) or (not self.level):
-            return {}
-        result = {(clean_item(self.item), self.level.lower()):
-                  (self.query_id, self.fase)}
-        if self.altitems:
-            for item in self.altitems:
-                if (clean_item(item), self.level.lower()) not in result:
-                    result[(clean_item(item), self.level.lower())] = (
-                        self.query_id, self.fase)
-        return result
-
     def to_sastadev(self) -> Query:
         sastadev_mapping = {'query_id': 'id'}
         processes = ['pre', 'core', 'post', 'form']
diff --git a/backend/analysis/views.py b/backend/analysis/views.py
index 17f8a71e..23a7b85c 100644
--- a/backend/analysis/views.py
+++ b/backend/analysis/views.py
@@ -5,7 +5,6 @@
 import logging
 from io import BytesIO, StringIO
 
-from analysis.annotations.safreader import SAFReader
 from analysis.query.run import annotate_transcript
 from annotations.reader import read_saf
 from annotations.writers.querycounts import querycounts_to_xlsx

From 6c5450db61a41b6d6a20d93c50f39c2549914d5c Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:27:27 +0200
Subject: [PATCH 27/36] Remove redundant analysis.macros module

---
 backend/analysis/macros/__init__.py | 0
 backend/analysis/macros/functions.py | 76 --------------
 backend/analysis/macros/sastamacros1.txt | 121 -----------------------
 backend/analysis/macros/sastamacros2.txt | 63 ------------
 backend/analysis/macros/tests.py | 30 ------
 5 files changed, 290 deletions(-)
 delete mode 100644 backend/analysis/macros/__init__.py
 delete mode 100644 backend/analysis/macros/functions.py
 delete mode 100644 backend/analysis/macros/sastamacros1.txt
 delete mode 100644 backend/analysis/macros/sastamacros2.txt
 delete mode 100644 backend/analysis/macros/tests.py

diff --git a/backend/analysis/macros/__init__.py b/backend/analysis/macros/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/analysis/macros/functions.py b/backend/analysis/macros/functions.py
deleted file mode 100644
index 6e110fbe..00000000
--- a/backend/analysis/macros/functions.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import re
-import os.path as op
-import logging
-logger = logging.getLogger('sasta')
-
-idpat = r'([A-z_][A-z0-9_]*)'
-eqpat = r'='
-exprpat = r'"""(.*?)"""'
-whitespaces = r'\s+'
-
-macrocallpat = r'(%.+?%)'
-macrocallre = re.compile(macrocallpat)
-
-macropat = idpat + whitespaces + eqpat + whitespaces + exprpat
-
-macrore = re.compile(macropat, re.S)
-
-MACROFILENAMES = ['sastamacros1.txt', 'sastamacros2.txt']
-
-
-def macrostrs2dict(teststrings):
-    macrodict = {}
-    for tstr in teststrings:
-        macromatches = macrore.finditer(tstr)
-        for macromatch in macromatches:
-            macroname = macromatch.group(1)
-            macroexpr = macromatch.group(2)
-            macrodict[macroname] = macroexpr
-
-    return macrodict
-
-
-def readmacros(macrofile, macrodict):
-    macrotext = macrofile.read()
-    macromatches = macrore.finditer(macrotext)
-    for macromatch in macromatches:
-        macroname = macromatch.group(1)
-        macroexpr = macromatch.group(2)
-        if macroname in macrodict:
-            logger.warning(
-                'Duplicate macro {} encountered. Ignored'.format(macroname))
-        else:
-            macrodict[macroname] = macroexpr
-    return macrodict
-
-
-def expandmacros(expr, macrodict):
-    result = expandmacrosdict(expr, macrodict)
-    return result
-
-
-def expandmacrosdict(expr, macrodict):
-    newexpr = expr
-    thematch = macrocallre.search(newexpr)
-    while thematch:
-        macrocall = thematch.group(1)
-        macroname = macrocall[1:-1]
-        if macroname in macrodict:
-            newexpr = macrocallre.sub(macrodict[macroname], newexpr)
-            thematch = macrocallre.search(newexpr)
-        else:
-            logger.error(
-                'Unknown macro call encountered: {}.'.format(macroname))
-            break
-    return newexpr
-
-
-def get_macros_dict(macrofilenames=MACROFILENAMES):
-    macrodict = {}
-    for macrofilename in macrofilenames:
-        script_dir = op.dirname(op.abspath(__file__))
-        file_path = op.join(script_dir, macrofilename)
-        macrofile = open(file_path, 'r', encoding='utf8')
-        macrodict = readmacros(macrofile, macrodict)
-        macrofile.close()
-    return macrodict
diff --git a/backend/analysis/macros/sastamacros1.txt b/backend/analysis/macros/sastamacros1.txt
deleted file mode 100644
index 864143c6..00000000
--- a/backend/analysis/macros/sastamacros1.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-Tarsp_VCr1 = """node[@rel="obj1" or @rel="pc" or @rel="predc" or @rel="ld" ] """
-
-
-b = """number(@begin)"""
-e = """number(@end)"""
-single_name = """( @ntype = 'eigen' or @postag='SPEC(deeleigen)' )"""
-
-multi_name = """( @cat='mwu' and node[@rel='mwp' and %single_name%] ) """
-
-name = """( %single_name% or %multi_name% )"""
-
-name_phrase= """( %name% or node[@rel="hd" and %name%] )"""
-
-booster = """(@lemma="allemachtig" or @lemma="beestachtig" or @lemma="bijzonder" or @lemma="bliksems" or @lemma="bloedig" or @lemma="bovenmate" or @lemma="buitengewoon" or @lemma="buitenmate" or @lemma="buitensporig" or @lemma="crimineel" or @lemma="deerlijk" or @lemma="deksels" or @lemma="donders" or @lemma="drommels" or @lemma="eindeloos" or @lemma="enorm" or @lemma="erbarmelijk" or @lemma="fantastisch" or @lemma="formidabel" or @lemma="geweldig" or @lemma="goddeloos" or @lemma="godsjammerlijk" or @lemma="grenzeloos" or @lemma="grotelijks" or @lemma="heel" or @lemma="ijselijk" or @lemma="ijzig" or @lemma="intens" or @lemma="krankzinnig" or @lemma="machtig" or @lemma="mirakels" or @lemma="monsterachtig" or @lemma="moorddadig" or @lemma="oneindig" or @lemma="onnoemelijk" or @lemma="ontiegelijk" or @lemma="ontstellend" or @lemma="ontzaglijk" or @lemma="ontzettend" or @lemma="onuitsprekelijk" or @lemma="onvoorstelbaar" or @lemma="onwezenlijk" or @lemma="onwijs" or @lemma="overweldigend" or @lemma="peilloos" or @lemma="reusachtig" or @lemma="reuze" or @lemma="schrikkelijk" or @lemma="sterk" or @lemma="uiterst" or @lemma="verdomd" or @lemma="verdraaid" or @lemma="verduiveld" or @lemma="verduveld" or @lemma="verrekt" or @lemma="verrot" or @lemma="verschrikkelijk" or @lemma="vervloekt" or @lemma="vreselijk" or @lemma="waanzinnig" or @lemma="zeer" or @lemma="zeldzaam" or @lemma="zwaar" )"""
-
-JO_v3 = """
-    count(.//node[@pt="ww"])>=3 and
-    (some $v1 in .//node[@pt="ww"],
-          $v2 in .//node[@pt="ww"],
-          $v3 in .//node[@pt="ww"]
-     satisfies ($v1/%PQ_e% < $v2/%PQ_e% and
-                $v2/%PQ_e% < $v3/%PQ_e% and
-                $v1/%PQ_e% < $v3/%PQ_e%
-               )
-    )"""
-
-JO_kijken_naar = """ parent::node[@cat="pp" and
-                     node[@lemma="naar" and @rel= "hd"] and
-                     parent::node[ node[@pt= "ww" and
-                                        @rel= "hd" and
-                                        @lemma= "kijken"
-                                       ]
-                                 ]
-                    ]
-"""
-
-Tarsp_hww = """
-    (@lemma="kunnen" or
-     @lemma = "moeten" or
-     @lemma= "hoeven" or
-     @lemma = "blijven" or
-     @lemma = "willen" or
-     @lemma = "zullen" or
-     @lemma = "doen" or
-     @lemma = "gaan" or
-     @lemma = "horen" or
-     @lemma = "komen" or
-     @lemma = "laten" or
-     @lemma = "liggen" or
-     @lemma = "lopen" or
-     @lemma = "mogen" or
-     @lemma = "staan" or
-     @lemma = "zitten ")
-    """
-
-Tarsp_OndWBVC = """
-(@cat="smain" or @cat="ssub") and node[@rel="su"] and node[@rel="hd" and @pt="ww"] and
-    ((count(node[@rel!="svp"]) = 4 and node[@rel="mod"] and %Tarsp_VCr1% ) or
-     (count(node) = 3 and node[node[@rel="mod"] and %Tarsp_VCr1% ])
-    )
-"""
-
-Tarsp_OndWBB = """
-(@cat="smain" or @cat="ssub") and node[@rel="su"] and node[@rel="hd" and @pt="ww"] and
-    ((count(node[@rel!="svp"]) = 4 and count(node[@rel="mod" or @rel="ld"]) = 2 ) or
-     (count(node) = 3 and node[@rel="vc" and (@cat="inf" or @cat="ppart") and count(node[@rel="mod" or @rel="ld"]) =2 ])
-    )
-"""
-
-ASTA_pred = """(@rel="predc" or @rel="predm" or (@rel="hd" and parent::node[@rel="predc" or @rel="predm"]))"""
-
-ASTA_attr = """((@rel="mod" and parent::node[node[@rel="hd" and (@pt="n" or @pt="vnw")]] ) or
-                (@rel="hd" and parent::node[@rel="mod" and parent::node[node[@rel="hd" and (@pt="n" or @pt="vnw")]]]))"""
-
-
-ASTA_adverbial = """
-    ((@rel="mod" and parent::node[node[@rel="hd" and not(@pt="n" or @pt="vnw")]] ) or
-     (@rel="hd" and parent::node[@rel="mod" and parent::node[node[@rel="hd" and not(@pt="n" or @pt="vnw")]]]))
-    """
-
-
-ASTA_modalww = """ (@lemma="zullen" or @lemma="willen" or @lemma="moeten" or @lemma="mogen" or @lemma="kunnen") """
-
-
-ASTA_kopww = """ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and parent::node[node[@rel="predc"] and not(node[@rel="obj1"])] )"""
-
-ASTA_wantmaarbijzin = """
-((@cat="smain" or (@cat="du" and node[@cat="smain" and @rel="nucl"])) and
- ../node[(@word="want" or @word="maar") and @rel="crd" and @pt="vg"] and
- @begin>=../node[(@word="want" or @word="maar")]/@end)
-"""
-
-ASTA_dusbijzin = """
-(@cat="smain" and node[@lemma="dus" and @begin=parent::node/@begin and @pt="bw" and @rel="mod"])
-"""
-
-ASTA_detadjs = """
-    (@pt="vnw" and (@rel="mod" or @rel="det") and @vwtype="onbep" and parent::node[@cat="np"] and
-(@lemma="al" or @lemma="beide" or @lemma="elk" or @lemma="enig" or @lemma="enkel" or @lemma="geen" or @lemma="ieder" or
- @lemma="meer" or @lemma="meerdere" or @lemma="meest" or @lemma="menig" or @lemma="minder" or
- @lemma="minst" or @lemma="sommige" or @lemma="veel" or @lemma="weinig"))
-
-"""
-
-Tarsp_kijkeens = """
-(@cat="sv1" and @rel != "--" and
- node[@pt="ww" and @lemma="kijken" and @rel="hd" and @pvagr="ev" and @pvtijd="tgw" ] and
- node[@lemma="eens" and @rel="mod" and @pt="bw"] and count(node)=2)
-"""
-
-ASTA_numeral = """
-(@lemma="twee" or @lemma="drie" or @lemma="vier" or @lemma="vijf" or @lemma="zes" or @lemma="zeven" or @lemma="acht" or @lemma="negen" or @lemma="tien" or @lemma="elf" or @lemma="twaalf" or @lemma="dertien" or @lemma="veertien" or @lemma="vijftien" or @lemma="zestien" or @lemma="zeventien" or @lemma="achttien" or @lemma="negentien" or @lemma="twintig" or @lemma="eentje" or @lemma="tweetjes" or @lemma="drietjes" or @lemma="viertjes" or @lemma="vijfjes" or @lemma="zesjes")
-"""
-
-ASTA_filled_pause = """
-(@lemma= "uh" or @lemma ="Uh" or @lemma="Uhm" or @lemma= "uhm" or @lemma = "euh"or @lemma = "eh" or @lemma = "goh" or @word="xxx" or @word="XXX")
-"""
-
-
-
-    
\ No newline at end of file
diff --git a/backend/analysis/macros/sastamacros2.txt b/backend/analysis/macros/sastamacros2.txt
deleted file mode 100644
index 26b2f5d9..00000000
--- a/backend/analysis/macros/sastamacros2.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-STAP_geen_BB = """
-not(@lemma="al"
-    or @lemma="dan"
-    or @lemma="dus"
-    or @lemma="eens"
-    or @lemma="gewoon"
-    or @lemma="meer"
-    or @lemma="niet"
-    or @lemma="nog"
-    or @lemma="nou"
-    or @lemma="nu"
-    or @lemma="ook"
-    or @lemma="toch"
-    or @lemma="toen"
-    or @lemma="weer"
-    or @lemma="wel"
-    or @lemma="zo")
-"""
-
-
-STAP_BB_t = """
-not((((@frame="tmp_adverb"
-       or @frame="adjective(both(tmpadv))"
-       and not(parent::node[@rel="mod"]))
-      or ((@cat="pp"
-           or @rel="mod")
-          and (node[@special="tmp"
-               or node[@special="tmp"]]))))
-    or (@cat="pp" and node[@pt="vz"
-        and (@lemma="sinds" or @lemma="gedurende"
-             or @lemma="na")])
-    or (@cat="np" and @rel="mod"
-        and node[@lemma="elk" or @rel="det"]
-        and node[@special="tmp"])
-    or (@rel="mod" and
-        (@lemma="net" or @lemma="gauw"
-         or @lemma="vroeger" or @lemma="toen"
-         or @lemma="soms" or @lemma="altijd")))
-"""
-
-STAP_BB_p = """
-not(((@cat="pp"
-      and @rel="ld")
-     or @frame="waar_adverb(naar)"
-     or @frame="waar_adverb(in)"
-     or @frame="waar_adverb(heen)"
-     or @frame="er_wh_loc_adverb"
-     or @frame="wh_loc_adverb"
-     or @frame="er_vp_adverb"
-     or @frame="er_adverb(uit)"
-     or @frame="er_loc_adverb"
-     or @frame="loc_adverb"
-     or (@cat="pp"
-         and node[@pt="vz"
-         and (@lemma="op"
-              or @lemma="bij"
-              or @lemma="in")]))
-    or @cat="pp" and node[@pt="vz"
-       and (@lemma="op" or @lemma="boven"
-            or @lemma="onder" or @lemma="tussen"
-            or @lemma="naast" or @lemma="achter"
-            or @lemma="bij" or @lemma="naar")])
-"""
diff --git a/backend/analysis/macros/tests.py b/backend/analysis/macros/tests.py
deleted file mode 100644
index 5535b9c2..00000000
--- a/backend/analysis/macros/tests.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import pytest
-from .functions import expandmacros, get_macros_dict
-import os.path as op
-
-# flake8: noqa: E501
-
-TESTSTRINGS = ['b = """number(@begin)"""', 'e = """number(@end)"""',
-               'single_name = """( @ntype = "eigen" or @postag="SPEC(deeleigen)" )"""',
-               'multi_name = """( @cat=''mwu'' and node[@rel=''mwp'' and %single_name%] ) """',
-               'name = """( %single_name% or %multi_name% )"""',
-               'name_phrase= """( %name% or node[@rel="hd" and %name%] )"""']
-TESTQUERIES = [('//node[%b%="3"]', '//node[number(@begin)="3"]'),
-               ('//node[%single_name%]',
-                "//node[( @ntype = 'eigen' or @postag='SPEC(deeleigen)' )]"),
-               ("//node[%multi_name%]",
-                "//node[( @cat='mwu' and node[@rel='mwp' and ( @ntype = 'eigen' or @postag='SPEC(deeleigen)' )] ) ]"),
-               ("//node[%fout%]", "//node[%fout%]")]
-
-MACROFILENAMES = ['sastamacros1.txt',
-                  'sastamacros2.txt']
-HERE = op.dirname(op.abspath(__file__))
-MACROFILENAMES = [op.join(HERE, fn) for fn in MACROFILENAMES]
-
-
-@pytest.mark.parametrize('short, long', TESTQUERIES)
-def test_macro_expansion(short, long):
-    macrodict = get_macros_dict(MACROFILENAMES)
-    for (short, long) in TESTQUERIES:
-        expansion = expandmacros(short, macrodict)
-        assert expansion == long

From e4aeabdd4a97ffbef94ab26d2c1469644561ebe9 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:27:56 +0200
Subject: [PATCH 28/36] Remove redundant zc_embedding module

---
 backend/analysis/score/__init__.py | 0
 .../score/testfiles/zc_embed_test.xml | 28 ------------
 backend/analysis/score/zc_embedding.py | 45 -------------------
 backend/analysis/score/zc_embedding_test.py | 16 -------
 4 files changed, 89 deletions(-)
 delete mode 100644 backend/analysis/score/__init__.py
From e4aeabdd4a97ffbef94ab26d2c1469644561ebe9 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:27:56 +0200
Subject: [PATCH 28/36] Remove redundant zc_embedding module

---
 backend/analysis/score/__init__.py          |  0
 .../score/testfiles/zc_embed_test.xml       | 28 ------------
 backend/analysis/score/zc_embedding.py      | 45 -------------------
 backend/analysis/score/zc_embedding_test.py | 16 -------
 4 files changed, 89 deletions(-)
 delete mode 100644 backend/analysis/score/__init__.py
 delete mode 100644 backend/analysis/score/testfiles/zc_embed_test.xml
 delete mode 100644 backend/analysis/score/zc_embedding.py
 delete mode 100644 backend/analysis/score/zc_embedding_test.py

diff --git a/backend/analysis/score/__init__.py b/backend/analysis/score/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/analysis/score/testfiles/zc_embed_test.xml b/backend/analysis/score/testfiles/zc_embed_test.xml
deleted file mode 100644
index 6f4d9eae..00000000
--- a/backend/analysis/score/testfiles/zc_embed_test.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- moet ook ergens een bakje waar dit in kan
-
-
\ No newline at end of file
diff --git a/backend/analysis/score/zc_embedding.py b/backend/analysis/score/zc_embedding.py
deleted file mode 100644
index 06a52f0a..00000000
--- a/backend/analysis/score/zc_embedding.py
+++ /dev/null
@@ -1,45 +0,0 @@
-
-from typing import Dict
-
-CLAUSALS = ['smain', 'rel', 'whrel', 'whsub', 'whq', 'sv1']
-
-
-def is_token(node):
-    return any(x in ['pt', 'pos'] for x in node.keys())
-
-
-def has_cat(node):
-    return 'cat' in node.keys()
-
-
-def is_clausal(node):
-    return is_direct_clausal(node) or is_child_clausal(node)
-
-
-def is_direct_clausal(node):
-    return node.attrib.get('cat') in CLAUSALS
-
-
-def is_child_clausal(node):
-    return node.attrib.get('cat') in ['cp'] and \
-        any(n.attrib.get('cat') in ['ssub', 'ti'] for n in list(node))
-
-
-def solve(node, embed, results):
-    if is_token(node):
-        results[node.attrib['begin']] = embed
-    if has_cat(node) and is_clausal(node):
-        embed += 1
-    for child in node.getchildren():
-        solve(child, embed, results)
-    return results
-
-
-def get_zc_embeddings(syntree) -> Dict[str, int]:
-    try:
-        root = syntree.getroot()
-    except Exception:
-        root = syntree
-    top_node = root.find('node')
-    results = solve(top_node, 0, {})
-    return results
diff --git a/backend/analysis/score/zc_embedding_test.py b/backend/analysis/score/zc_embedding_test.py
deleted file mode 100644
index 79071521..00000000
--- a/backend/analysis/score/zc_embedding_test.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os.path as op
-
-from lxml import etree as ET
-
-from .zc_embedding import get_zc_embeddings
-
-HERE = op.dirname(op.abspath(__file__))
-FILES = op.join(HERE, 'testfiles')
-
-
-def test_zc_embed():
-    tree = ET.parse(op.join(FILES, 'zc_embed_test.xml'))
-    expected_embeddings = [1, 1, 1, 1, 1, 2, 2, 2, 2]
-    word_indices = [str(x) for x in range(0, 9)]
-    expected = dict(zip(word_indices, expected_embeddings))
-    assert expected == get_zc_embeddings(tree)
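
Note on what the removed module computed, for readers tracking it: for every token in a parse tree, the number of clausal constituents (smain, rel, whrel, whsub, whq, sv1) on its path from the root — the commit calls the module redundant, presumably because the sastadev pipeline now covers this. A simplified, self-contained sketch of the recursion (omitting the deleted cp/ssub special case):

    from lxml import etree

    CLAUSALS = ['smain', 'rel', 'whrel', 'whsub', 'whq', 'sv1']

    # Simplified rendering of the removed algorithm, for reference only.
    def embeddings(node, depth=0, out=None):
        out = {} if out is None else out
        if 'pt' in node.keys() or 'pos' in node.keys():  # token node
            out[node.get('begin')] = depth
        if node.get('cat') in CLAUSALS:  # entering a clausal constituent
            depth += 1
        for child in node:
            embeddings(child, depth, out)
        return out

    xml = ('<node cat="smain">'
           '<node pt="ww" begin="0"/>'
           '<node cat="rel"><node pt="n" begin="1"/></node>'
           '</node>')
    print(embeddings(etree.fromstring(xml)))  # {'0': 1, '1': 2}
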
From 8e56b3712da4558ce6abc2144f1533116069c9b6 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 10 Apr 2024 14:38:48 +0200
Subject: [PATCH 29/36] Fix being able to correct nonexisting annotations

---
 frontend/src/app/transcript/transcript.component.ts | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/frontend/src/app/transcript/transcript.component.ts b/frontend/src/app/transcript/transcript.component.ts
index 68e93be2..cd1dcceb 100644
--- a/frontend/src/app/transcript/transcript.component.ts
+++ b/frontend/src/app/transcript/transcript.component.ts
@@ -23,6 +23,8 @@ import {
   TranscriptService,
 } from '@services';

+import _ from 'lodash';
+
 const XLSX_MIME =
   'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
 const TXT_MIME = 'text/plain';
@@ -72,15 +74,19 @@ export class TranscriptComponent implements OnInit, OnDestroy {
     );
   }

+  hasLatestRun(): boolean {
+    return !_.isNil(this.transcript.latest_run);
+  }
+
   allowCorrectionUpload(): boolean {
     return (
       this.transcript.status === TranscriptStatus.PARSED &&
-      this.transcript.latest_run !== undefined
+      this.hasLatestRun()
     );
   }

   allowCorrectionReset(): boolean {
-    return this.transcript.latest_run !== undefined;
+    return this.hasLatestRun();
   }

   allowScoring(): boolean {
From 8224dcec17eec2e55b435e37450b17ea4f20b247 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 24 Apr 2024 11:18:20 +0200
Subject: [PATCH 30/36] Fix SAF reading

---
 backend/annotations/reader.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backend/annotations/reader.py b/backend/annotations/reader.py
index 99cf4c56..03f11de2 100644
--- a/backend/annotations/reader.py
+++ b/backend/annotations/reader.py
@@ -6,8 +6,7 @@ def read_saf(saf_filename: str, method: Method,
              includeimplies: bool = False) -> AllResults:
     '''Wrapper around SASTADEV SAF reader'''
     infilename = saf_filename
-    allutts, richexactscores = get_golddata(infilename, method.item2idmap, method.altcodes,
-                                            method.queries, includeimplies)
+    allutts, richexactscores = get_golddata(infilename, method, includeimplies)
     exactresults = richscores2scores(richexactscores)
     annotatedfileresults = AllResults(uttcount=len(allutts),
                                       coreresults={},
From 89253085d8cf5bb4cc255f6e6db18340d8d4e0ed Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Wed, 24 Apr 2024 13:55:52 +0200
Subject: [PATCH 31/36] Move anonymizations file to sastadev

---
 backend/analysis/convert/replacements.py |  7 ++--
 backend/anonymization.json               | 49 ------------------------
 backend/conftest.py                      |  2 +-
 3 files changed, 5 insertions(+), 53 deletions(-)
 delete mode 100644 backend/anonymization.json

diff --git a/backend/analysis/convert/replacements.py b/backend/analysis/convert/replacements.py
index 93394d0c..ab68cbbc 100644
--- a/backend/analysis/convert/replacements.py
+++ b/backend/analysis/convert/replacements.py
@@ -2,11 +2,11 @@ from string import ascii_lowercase
 import os.path as op
 import json

-from django.conf import settings
+from sastadev.conf import settings as sdsettings


 def instantiate_anonymizations():
-    json_path = op.join(settings.BASE_DIR, 'anonymization.json')
+    json_path = op.join(sdsettings.SD_DIR, 'data', 'anonymization.json')
     with open(json_path, 'r') as f:
         return json.load(f)
@@ -35,7 +35,8 @@ def fill_name(string):
     def repl(match):
         raw_index = match.group(3) or '0'
-        index = int(raw_index) if raw_index.isnumeric() else letter_index(raw_index)
+        index = int(raw_index) if raw_index.isnumeric(
+        ) else letter_index(raw_index)
         repl = specs['common'][index]
         return match.group(1) + repl + match.group(4)
diff --git a/backend/anonymization.json b/backend/anonymization.json
deleted file mode 100644
index 60edeec1..00000000
--- a/backend/anonymization.json
+++ /dev/null
@@ -1,49 +0,0 @@
-[
-  {
-    "category": "place",
-    "codes": ["PLAATS", "PLAATSNAAM", "WOONPLAATS"],
-    "common": ["Utrecht", "Breda", "Leiden", "Maastricht", "Arnhem"]
-  },
-  {
-    "category": "lastname",
-    "codes": ["ACHTERNAAM"],
-    "common": ["Jansen", "Hendriks", "Dekker", "Dijkstra", "Veenstra"]
-  },
-  {
-    "category": "person",
-    "codes": ["NAAM", "BROER", "ZUS", "KIND", "VADER", "MOEDER"],
-    "common": ["Maria", "Jan", "Anna", "Esther", "Pieter", "Sam"]
-  },
-  {
-    "category": "profession",
-    "codes": ["BEROEP"],
-    "common": ["timmerman", "chirurgh", "leraar", "ober", "verslaggever"]
-  },
-  {
-    "category": "country",
-    "codes": ["LAND"],
-    "common": ["Duitsland", "Nederland", "Japan", "Kameroen", "India"]
-  },
-  {
-    "category": "education",
-    "codes": ["STUDIE", "OPLEIDING"],
-    "common": [
-      "bedrijfskunde",
-      "informatica",
-      "filosofie",
-      "rechtsgeleerdheid",
-      "werktuigbouwkunde"
-    ]
-  },
-  {
-    "category": "institution",
-    "codes": ["ZORGINSTELLING", "INSTELLING", "ZIEKENHUIS"],
-    "common": [
-      "Diakonessenhuis",
-      "Rijnstate",
-      "Vogellanden",
-      "HagaZiekenhuis",
-      "Slingeland"
-    ]
-  }
-]
diff --git a/backend/conftest.py b/backend/conftest.py
index 8deeed8c..66f4992c 100644
--- a/backend/conftest.py
+++ b/backend/conftest.py
@@ -70,7 +70,7 @@ def tarsp_method(db, tarsp_category, method_dir):

 @pytest.fixture
 def asta_method(db, asta_category, method_dir):
-    file = glob.glob(f'{method_dir}/ASTA Index Current.xlsx')[0]
+    file = glob.glob(f'{method_dir}/ASTA_Index_Current.xlsx')[0]
     with open(file, 'rb') as f:
         wrapped_file = File(f)
         instance = AssessmentMethod(
             name='asta_test_method', category=asta_category)
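
Note on the anonymization mechanics in the patch above: codes like NAAM1 or a bare PLAATS map to a replacement by index, where a missing suffix counts as index 0. A stand-alone sketch of that lookup — simplified from replacements.py, with made-up spec data; the real fill_name also handles letter suffixes via letter_index and reads the anonymization.json that now ships with sastadev:

    import re

    # Stand-in for the anonymization specs (the real list lives in
    # sastadev's data directory).
    SPECS = {
        'NAAM': ['Maria', 'Jan', 'Anna'],
        'PLAATS': ['Utrecht', 'Breda', 'Leiden'],
    }

    def anonymize(text: str) -> str:
        # CODE or CODE<n>: a bare code counts as index 0.
        pattern = re.compile(r'\b([A-Z]+?)(\d*)\b')

        def repl(match: re.Match) -> str:
            code, raw_index = match.groups()
            if code not in SPECS:
                return match.group(0)  # unknown codes stay untouched
            index = int(raw_index) if raw_index else 0
            return SPECS[code][index]

        return pattern.sub(repl, text)

    print(anonymize('ik heet NAAM1 en woon in PLAATS'))
    # -> 'ik heet Jan en woon in Utrecht'
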
["STUDIE", "OPLEIDING"], - "common": [ - "bedrijfskunde", - "informatica", - "filosofie", - "rechtsgeleerdheid", - "werktuigbouwkunde" - ] - }, - { - "category": "institution", - "codes": ["ZORGINSTELLING", "INSTELLING", "ZIEKENHUIS"], - "common": [ - "Diakonessenhuis", - "Rijnstate", - "Vogellanden", - "HagaZiekenhuis", - "Slingeland" - ] - } -] diff --git a/backend/conftest.py b/backend/conftest.py index 8deeed8c..66f4992c 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -70,7 +70,7 @@ def tarsp_method(db, tarsp_category, method_dir): @pytest.fixture def asta_method(db, asta_category, method_dir): - file = glob.glob(f'{method_dir}/ASTA Index Current.xlsx')[0] + file = glob.glob(f'{method_dir}/ASTA_Index_Current.xlsx')[0] with open(file, 'rb') as f: wrapped_file = File(f) instance = AssessmentMethod( From affb71b4d23d7402fec7098f5a51742562a40129 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 24 Apr 2024 14:26:20 +0200 Subject: [PATCH 32/36] Add tests for new anonymizations --- backend/analysis/convert/tests/conftest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/analysis/convert/tests/conftest.py b/backend/analysis/convert/tests/conftest.py index 8ffd27c3..dc29e31b 100644 --- a/backend/analysis/convert/tests/conftest.py +++ b/backend/analysis/convert/tests/conftest.py @@ -131,4 +131,9 @@ def example_utterances(): 'exp_text': 'Ik heet Jan en hij heet Anna.', 'exp_tiers': {'xano': '8|NAAM1|Jan, 24|NAAM2|Anna'}, }, + { + 'text': 'Ik zit op de SCHOOL1 en hij op de SCHOOL2.', + 'exp_text': 'Ik zit op de Mariaschool en hij op de Calvijnschool.', + 'exp_tiers': {'xano': '13|SCHOOL1|Mariaschool, 38|SCHOOL2|Calvijnschool'}, + }, ] From dcc928413da69a2f9bea104a75f7bf1f23c2ccc3 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Thu, 25 Apr 2024 14:58:26 +0200 Subject: [PATCH 33/36] prepare_parameters does not need a special case for annotationinput --- backend/analysis/query/query_transcript.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/backend/analysis/query/query_transcript.py b/backend/analysis/query/query_transcript.py index c888add9..a74b1689 100644 --- a/backend/analysis/query/query_transcript.py +++ b/backend/analysis/query/query_transcript.py @@ -9,14 +9,6 @@ def prepare_parameters(infilename: str, method: Method, targets: int, annotationinput: bool) -> SastaCoreParameters: - # TODO: check corr/corrn - - if annotationinput: - # If existing annotations exist - # dont supply origtreebank - # infilename becomes the path to existing SAF file - pass - return SastaCoreParameters( annotationinput=annotationinput, themethod=method, From 94c2fdfb06d49f3a6942b560b2fbfc3635c7582b Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Fri, 26 Apr 2024 10:56:31 +0200 Subject: [PATCH 34/36] Naturally sort string utterance keys --- backend/annotations/writers/saf_xlsx.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/annotations/writers/saf_xlsx.py b/backend/annotations/writers/saf_xlsx.py index 34f779dd..7dbf1010 100644 --- a/backend/annotations/writers/saf_xlsx.py +++ b/backend/annotations/writers/saf_xlsx.py @@ -13,6 +13,7 @@ from annotations.constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS, SAF_COMMENT_LEVEL, SAF_UTT_LEVEL) from annotations.utils import autosize_columns, format_worksheet, get_max_words, ljust +from natsort import natsorted @dataclass @@ -104,7 +105,7 @@ def _make_levels_rows(self, ws: Worksheet) -> None: row_size = len(self.anno_headers) all_levels = self.method_category.levels + 
From 28194644aa6f4cc64f5af6a48edc953d0e13419f Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Fri, 26 Apr 2024 11:06:26 +0200
Subject: [PATCH 35/36] Bump sastadev

---
 backend/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index cc10a01d..77541270 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -212,7 +212,7 @@ requests==2.28.1
     #   spacy
 requests-oauthlib==1.3.1
     # via django-allauth
-sastadev==0.1.5
+sastadev==0.2.0
     # via
     #   -r requirements.in
     #   auchann
From ed3bfda0bb86c251ba2879190c7be8ca651890fe Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen
Date: Fri, 26 Apr 2024 11:09:34 +0200
Subject: [PATCH 36/36] Bump versions and changelog

---
 CHANGELOG.md                         | 7 +++++++
 CITATION.cff                         | 2 +-
 frontend/src/environments/version.ts | 2 +-
 package.json                         | 2 +-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9fa500a8..be0da2bd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes (beginning at version 0.2.0) to this project will be documen
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.9.0] - 2024-04-26
+### Changed
+Drastic changes to the way analysis is performed, replacing functionality with SASTADEV counterparts:
+- reading SAF files
+- performing analysis
+- writing SAF files
+
 ## [0.8.2] - 2024-04-24
 ### Changed
 Upgraded SASTADEV dependency
diff --git a/CITATION.cff b/CITATION.cff
index 8bd64865..7755d98d 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -41,5 +41,5 @@ abstract: >-
   transcripts, to aid clinical linguists and research into
   language development and language disorders.
 license: BSD-3-Clause
-version: 0.8.2
+version: 0.9.0
 date-released: '2024-01-31'
diff --git a/frontend/src/environments/version.ts b/frontend/src/environments/version.ts
index a69c88f7..40fc290e 100644
--- a/frontend/src/environments/version.ts
+++ b/frontend/src/environments/version.ts
@@ -1,2 +1,2 @@
 // TODO: Compile this from toplevel package.json
-export const version = '0.8.2';
+export const version = '0.9.0';
diff --git a/package.json b/package.json
index 74886011..cfafca77 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "sasta",
-  "version": "0.8.2",
+  "version": "0.9.0",
   "description": "Annotate and analyze transcripts",
   "author": "UU Digital Humanities Lab",
   "license": "BSD-3-Clause",