diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9fa500a8..be0da2bd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes (beginning at version 0.2.0) to this project will be documen
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.9.0] - 2024-04-26
+### Changed
+Drastic changes to the way analysis is performed, replacing functionality with SASTADEV counterparts:
+- reading SAF files
+- performing analysis
+- writing SAF files
+
## [0.8.2] - 2024-04-24
### Changed
Upgraded SASTADEV dependency
diff --git a/CITATION.cff b/CITATION.cff
index 8bd64865..7755d98d 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -41,5 +41,5 @@ abstract: >-
transcripts, to aid clinical linguists and research into
language development and language disorders.
license: BSD-3-Clause
-version: 0.8.2
+version: 0.9.0
date-released: '2024-01-31'
diff --git a/backend/analysis/annotations/annotation_format.py b/backend/analysis/annotations/annotation_format.py
deleted file mode 100644
index 853a3685..00000000
--- a/backend/analysis/annotations/annotation_format.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import operator
-from collections import Counter, defaultdict
-from functools import reduce
-from typing import Dict, List, Optional
-
-from analysis.models import AssessmentMethod
-from analysis.results.results import (AllResults, SastaAllUtts, SastaAnnotations, SastaExactResults,
- UtteranceWord)
-
-
-class SAFAnnotation:
- def __init__(self, level, label, fase=None, query_id=None):
- self.level: str = level
- self.label: str = label
- self.fase: str = fase
- self.query_id: Optional[str] = query_id
-
-
-class SAFDocument:
- def __init__(self, name, method, all_levels=None):
- self.name: str = name
- self.method: AssessmentMethod = method
- self.utterances: List[SAFUtterance] = []
- self.all_levels: Optional[List[str]] = all_levels
- self.annotations: SastaAnnotations = {}
- self.exactresults: SastaExactResults = defaultdict(list)
- self.allutts: SastaAllUtts = defaultdict(list)
-
- @property
- def all_annotations(self):
- return reduce(operator.concat,
- [utt.annotations for utt in self.utterances])
-
- @property
- def queries(self):
- '''Set of all query IDs in the document'''
- return set(
- ann.query_id for ann in self.all_annotations
- )
-
- @property
- def item_counts(self):
- return {u.utt_id: u.item_counts for u in self.utterances}
-
- def to_allresults(self) -> AllResults:
- '''Convert to AllResults object (for query and scoring).'''
- filename = self.name
- uttcount = len(self.utterances)
- results = {
- q: Counter({
- u.utt_id: u.item_counts[q]
- for u in self.utterances
- if u.item_counts[q] > 0
- })
- for q in self.queries
- }
-
- allresults = AllResults(
- filename,
- uttcount,
- coreresults=results,
- exactresults=self.exactresults,
- allutts=self.allutts
- )
-
- return allresults
-
- @property
- def reformatted_annotations(self) -> Dict[int, List[UtteranceWord]]:
- annotations = {}
- for utt in self.utterances:
- annotations[utt.utt_id] = []
- for word in utt.words:
- uw = UtteranceWord(
- word=word.text,
- begin=word.idx - 1, # TODO: does this need to be normalized?
- end=word.idx,
- hits=[],
- idx=word.idx,
- zc_embedding=0, # TODO: CHECK ZC EMBEDS,
- comments=word.comment
- )
- for ann in word.annotations:
- hit = self.hit_from_annotation(ann)
- uw.hits.append(hit)
- annotations[utt.utt_id].append(uw)
- return annotations
-
- def hit_from_annotation(self, ann) -> Dict:
- q = self.method.queries.get(query_id=ann.query_id)
- # Try to match the actual alt item
- item_matches = [ai for ai in q.altitems if ai.lower() == ann.label.lower()]
- return {
- 'level': q.level,
- 'item': item_matches[0] if item_matches else q.item,
- 'fase': q.fase
- }
-
-
-class SAFUtterance:
- def __init__(self, utt_id):
- self.utt_id: int = utt_id
- self.words: List[SAFWord] = []
-
- @property
- def item_counts(self):
- return sum([w.item_counts for w in self.words], Counter())
-
- @property
- def annotations(self):
- return reduce(operator.concat, [w.annotations for w in self.words])
-
-
-class SAFWord:
- def __init__(self, idx, text, begin, end, comment=None):
- self.idx: int = idx
- self.begin: int = begin
- self.end: int = end
- self.text: str = text
- self.annotations: List[SAFAnnotation] = []
- self.comment: str = comment or ''
-
- @property
- def item_counts(self):
- return Counter({a.query_id for a in self.annotations if a.query_id})
diff --git a/backend/analysis/annotations/constants.py b/backend/analysis/annotations/constants.py
deleted file mode 100644
index 7340b3c5..00000000
--- a/backend/analysis/annotations/constants.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from typing import Counter, Dict, Optional, Tuple
-
-# Type annotations
-TupleStrDict = Dict[Tuple[Optional[str], Optional[str]], str]
-CounterDict = Dict[str, Counter[str]]
-
-# Global
-ITEMSEPPATTERN = r'[,-; ]'
-LABELSEP = ','
-UTTLEVEL = 'utt'
-HEADER_VARIANTS = {
- 'speaker': ['speaker', 'spreker', 'spk'],
- 'utt_id': ['id', 'utt', 'uttid'],
- 'level': ['level'],
- 'phase': ['fases', 'stages'],
- 'comments': ['comments', 'commentaar']
-}
-PREFIX = ""
-ALTITEMSEP = IMPLIESSEP = ','
-SAF_COMMENT_LEVEL = 'Commentaar'
-SAF_UNALIGNED_LEVEL = 'Unaligned'
-
-# Define (lowercased) levels that should not be cleaned
-# Currently, only comment rows should be excempt
-NO_CLEAN_LEVELS = (SAF_COMMENT_LEVEL.lower(), )
diff --git a/backend/analysis/annotations/enrich_chat.py b/backend/analysis/annotations/enrich_chat.py
deleted file mode 100644
index cfb43e2c..00000000
--- a/backend/analysis/annotations/enrich_chat.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import itertools
-from typing import List
-
-from analysis.models import AssessmentMethod, Transcript
-from analysis.results.results import AllResults
-from chamd.chat_reader import ChatLine, ChatTier
-from convert.chat_reader import ChatDocument
-
-
-def find_doc_line(lines: List[ChatLine], uttno: int) -> ChatLine:
- # TODO: more efficient way to do this?
- return next((x for x in lines if x.uttid == uttno), None)
-
-
-def enrich_chat(transcript: Transcript,
- allresults: AllResults,
- method: AssessmentMethod) -> ChatDocument:
- doc = ChatDocument.from_chatfile(transcript.content.path, transcript.corpus.method_category)
-
- # construct a mapping of uttno to uttid
- # because uttid is unknown to CHAT
- marked_utts = (x for x in transcript.utterances.all() if x.for_analysis)
- id_no_mapping = {
- u.utt_id: u.uttno for u in marked_utts
- }
-
- items = sorted(allresults.annotations.items())
- for utt_id, words in items:
- uttno = id_no_mapping.get(utt_id)
- doc_line = find_doc_line(doc.lines, uttno)
- flattened_hits = itertools.chain(*(w.hits for w in words))
- annotations = [x.get('item') for x in flattened_hits]
- if annotations:
- annotation_str = ', '.join(annotations)
- doc_line.tiers['xsyn'] = ChatTier(id='xsyn', text=annotation_str)
- # id_headers = [h for h in doc.headers if h.line.startswith('@ID')]
- # last_id_header = max(id_headers, key=attrgetter('linestartno'))
- # doc.headers.append(ChatHeader(
- # line=f'@Comment:\tAnnotations on %xsyn-tiers generated by SASTA, using {method.category.name}',
- # linestartno=last_id_header.linestartno+1))
-
- return doc
diff --git a/backend/analysis/annotations/safreader.py b/backend/analysis/annotations/safreader.py
deleted file mode 100644
index 91650b9a..00000000
--- a/backend/analysis/annotations/safreader.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import logging
-import os
-from typing import List, Optional, Tuple
-
-import pandas as pd
-from analysis.models import Transcript
-
-from .annotation_format import (SAFAnnotation, SAFDocument, SAFUtterance,
- SAFWord)
-from .constants import (LABELSEP, PREFIX, SAF_COMMENT_LEVEL,
- SAF_UNALIGNED_LEVEL, UTTLEVEL)
-from .utils import (clean_item, clean_row, enrich, getlabels, item2queryid,
- mkpatterns, standardize_header_name)
-
-logger = logging.getLogger('sasta')
-
-
-class NoWordDataException(Exception):
- '''Raised when:
- - There are no annotations for the word/level combination OR
- - There is no word
- '''
- pass
-
-
-class UnalignedWord(Exception):
- '''Raised when word is unaligned'''
- pass
-
-
-def get_word_levels(data: pd.DataFrame):
- levels = data.level
- filtered_levels = levels[~levels.isin([SAF_COMMENT_LEVEL.lower(), UTTLEVEL.lower()])]
- return list(filtered_levels.unique())
-
-
-def is_word_column(column_name: str) -> bool:
- return column_name.lower().startswith('word')
-
-
-def word_level_data(word_data: pd.DataFrame, colname: str):
- '''returns combination word/level
- '''
- if colname.lower() == SAF_UNALIGNED_LEVEL.lower():
- raise UnalignedWord
- elif word_data.empty:
- raise NoWordDataException
- utt_data = word_data.loc[word_data.level == UTTLEVEL, colname]
- return utt_data
-
-
-class SAFReader:
- def __init__(self, filepath, method, transcript: Transcript = None):
- self.filepath = filepath
- self.word_cols = []
- self.levels: List[str] = []
- self.data = self.loaddata(filepath)
- self.method = method
- self.transcript: Optional[Transcript] = transcript or None
- self.item_mapping, self.patterns = self.make_mappings()
- self.document = SAFDocument(os.path.basename(
- filepath), method, self.levels)
- self.errors: List[Tuple] = []
- self.get_annotations(self.data)
-
- def formatted_errors(self):
- results = []
- for (utt_id, word_id, text, level, label) in self.errors:
- results.append(f'Unknown item "{label}" found in utterance {utt_id}, word {word_id} ("{text}"), level "{level}"')
- return results
-
- def loaddata(self, filepath):
- data = pd.read_excel(filepath, engine='openpyxl')
- data.rename(columns=standardize_header_name, inplace=True)
- data = data.where(data.notnull(), None)
- self.word_cols = [SAF_UNALIGNED_LEVEL.lower()] + list(filter(is_word_column, data.columns))
-
- # Do we need to drop empty columns? Seems we don't. If otherwise, make sure word_columns are not dropped
- # data.dropna(how='all', axis=1, inplace=True)
-
- relevant_cols = ['utt_id', 'level'] + self.word_cols
- self.levels = [lv for lv in list(
- data.level.dropna().unique()) if lv.lower() != UTTLEVEL]
-
- data = data[relevant_cols].apply(clean_row, axis='columns')
-
- return data
-
- def make_mappings(self):
- item_mapping = self.method.get_item_mapping(LABELSEP)
- items = [item for (item, _) in item_mapping if item]
- patterns = mkpatterns(items)
- return item_mapping, patterns
-
- def get_annotations(self, data):
- for utt_id in data.utt_id.unique():
- utt_rows = data[data.utt_id == utt_id]
- parsed_utterance = self.parse_utterance(utt_id, utt_rows)
- self.document.utterances.append(parsed_utterance)
-
- def parse_utterance(self, utt_id, utt_data):
- instance = SAFUtterance(utt_id)
- utt_object = self.transcript.get_utterance_by_id(utt_id)
- self.document.allutts[utt_object.utt_id] = utt_object.word_list
- for idx, wcol in enumerate(self.word_cols):
- relevant_cols = ['level', wcol]
- word = self.parse_word(utt_id, idx,
- wcol, utt_data[relevant_cols], utt_object.word_position_mapping)
- if word:
- instance.words.append(word)
-
- return instance
-
- def parse_word(self, utt_id, word_id, colname, word_data, wordposmap):
- data = word_data
- if colname != SAF_UNALIGNED_LEVEL.lower():
- # Don't drop data for unaligned
- data = word_data.dropna()
-
- try:
- utt_data = word_level_data(data, colname)
- text = utt_data.iloc[0]
-
- except UnalignedWord:
- text = ''
- except NoWordDataException:
- return None
-
- (begin, end) = wordposmap[word_id]['begin'], wordposmap[word_id]['end']
- instance = SAFWord(word_id, text, begin, end)
-
- word_levels = get_word_levels(data)
- for level in word_levels:
- item_data = data.loc[data.level == level, colname].iloc[0]
- if not pd.isnull(item_data):
- label = clean_item(item_data)
- enriched_label = enrich(label, PREFIX.lower())
- split_labels = getlabels(enriched_label, self.patterns)
-
- if not split_labels:
- self.errors.append((utt_id, word_id, text, level, label))
-
- self.map_labels(split_labels, instance,
- level, utt_id, word_id, text)
-
- # read comments
- comment_data = data.loc[data.level == SAF_COMMENT_LEVEL.lower()].dropna()
- if not comment_data.empty:
- instance.comment = str(comment_data[colname].iloc[0])
-
- return instance
-
- def map_labels(self, split_labels: List[str], saf_word: SAFWord, level: str, utt_id, word_id, text):
- for label in split_labels:
- mapped = item2queryid(label, level, self.item_mapping)
- if mapped:
- query_id, fase = mapped
- saf_word.annotations.append(SAFAnnotation(
- level, label, fase, query_id))
- self.document.exactresults[query_id].append(
- (utt_id, word_id))
-
- else:
- logger.warning(
- 'Cannot resolve query_id for (%s, %s)', level, label)
- self.errors.append(
- (utt_id, word_id, text, level, label))
diff --git a/backend/analysis/annotations/safreader_test.py b/backend/analysis/annotations/safreader_test.py
deleted file mode 100644
index 105d96bf..00000000
--- a/backend/analysis/annotations/safreader_test.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import os.path as op
-from operator import itemgetter
-
-import pytest
-from analysis.annotations.safreader import SAFReader, get_word_levels
-from analysis.query.run import query_transcript
-from pandas import DataFrame
-from pytest_lazyfixture import lazy_fixture
-
-
-@pytest.mark.parametrize("method, transcript, filedir, samplenum", [
- (lazy_fixture("tarsp_method"), lazy_fixture("tarsp_transcript"),
- lazy_fixture("cha_testfiles_dir"), 5),
- (lazy_fixture("asta_method"), lazy_fixture("asta_transcript"),
- lazy_fixture("cha_testfiles_dir"), 16)
-]
-)
-def test_read_saf(method, transcript, filedir, samplenum):
- true_results, _ = query_transcript(transcript, method, annotate=True, zc_embed=method.category.zc_embeddings)
- assert not true_results.annotationinput
-
- reader = SAFReader(op.join(filedir, f'sample_{samplenum}_SAF.xlsx'), method, transcript)
- read_results = reader.document.to_allresults()
-
- # are the coreresults the same?
- assert sorted(read_results.coreresults.keys()) == sorted(true_results.coreresults.keys())
- for q, hits in read_results.coreresults.items():
- true_hits = true_results.coreresults[q]
- assert hits == true_hits
-
- # are all the annotations the same?
- assert true_results.annotations.keys() == reader.document.reformatted_annotations.keys()
- for q, annos in true_results.annotations.items():
- true_annos = reader.document.reformatted_annotations[q]
- for word in annos:
- true_word = next((w for w in true_annos if w.index == word.index), None)
- hits = sorted(word.hits, key=itemgetter('level', 'item'))
- if true_word:
- true_hits = sorted(true_word.hits, key=itemgetter('level', 'item'))
- assert hits == true_hits
- else:
- # if the true_word is not found (unaligned empty), make sure it didnt miss anything
- assert hits == []
-
- # are the exactresults the same?
- true_exact = {k: sorted(v) for (k, v) in true_results.exactresults.items() if v != []}
- read_exact = {k: sorted(v) for (k, v) in read_results.exactresults.items() if v != []}
- assert true_exact == read_exact
-
- # are the allutts the same?
- assert true_results.allutts == read_results.allutts
-
-
-def test_astalex(asta_method, asta_transcript, asta_transcript_corrections, cha_testfiles_dir):
- true_results, _ = query_transcript(asta_transcript, asta_method, annotate=True, zc_embed=False)
- assert true_results.annotationinput
-
- assert true_results.annotations.get(3)[7].hits == [{'level': 'Taalmaat', 'item': 'N', 'fase': 0}]
-
- assert True
-
-
-def test_wordlevels():
- data = {'level': map(str.lower, ['Utt', 'QA', 'SZ', 'Grammaticale Fout', 'Commentaar']),
- 'word1': [1, None, 'X', 'V, BvBB', 'Hier staat wat commentaar']}
- df_in = DataFrame.from_dict(data)
-
- word_levels = get_word_levels(df_in)
- assert word_levels == ['qa', 'sz', 'grammaticale fout']
-
-
-def test_read_saf_comments(tarsp_method, tarsp_transcript, cha_testfiles_dir):
- reader = SAFReader(op.join(cha_testfiles_dir, 'sample_5_SAF_with_comments.xlsx'), tarsp_method, tarsp_transcript)
- sent = reader.document.utterances[3]
- assert sent.words[1].comment == 'Ik vind hier iets van.'
- assert sent.words[2].comment == '1'
- assert sent.words[3].comment == 'En hier misschien ook wel iets van'
diff --git a/backend/analysis/annotations/utils.py b/backend/analysis/annotations/utils.py
deleted file mode 100644
index 5a2d7338..00000000
--- a/backend/analysis/annotations/utils.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import logging
-import re
-from typing import List, Pattern, Tuple
-
-import pandas
-
-from .constants import (HEADER_VARIANTS, ITEMSEPPATTERN, LABELSEP,
- NO_CLEAN_LEVELS, TupleStrDict)
-
-logger = logging.getLogger('sasta')
-
-
-def standardize_header_name(header: str) -> str:
- '''lowercase and standardize header'''
- header = header.lower()
- for key, val in HEADER_VARIANTS.items():
- if header in val:
- return key
- return header
-
-
-def clean_row(row: pandas.Series) -> pandas.Series:
- if row.level.lower() in NO_CLEAN_LEVELS:
- row.level = row.level.lower()
- return row
- return row.apply(clean_cell)
-
-
-def clean_cell(cell):
- if isinstance(cell, str):
- result = cell
- result = result.lstrip()
- result = result.rstrip()
- result = result.lower()
- return result
- return cell
-
-
-def mkpatterns(allcodes: List[str]) -> Tuple[Pattern, Pattern]:
- basepattern = r''
- sortedallcodes = sorted(allcodes, key=len, reverse=True)
- adaptedcodes = [codeadapt(c) for c in sortedallcodes]
- basepattern = r'' + '|'.join(adaptedcodes) + '|' + ITEMSEPPATTERN
- fullpattern = r'^(' + basepattern + r')*$'
-
- return (re.compile(basepattern), re.compile(fullpattern))
-
-
-def codeadapt(code: str) -> str:
- result = code
- result = re.sub(r'\.', r'\\.', result)
- result = re.sub(r'\(', r'\\(', result)
- result = re.sub(r'\)', r'\\)', result)
- result = re.sub(r'\?', r'\\?', result)
- result = re.sub(r'\*', r'\\*', result)
- result = re.sub(r'\+', r'\\+', result)
- result = re.sub(r' ', r'\\s+', result)
-
- return result
-
-
-def enrich(labelstr: str, lcprefix: str) -> str:
- if not labelstr:
- return labelstr
- try:
- labels = labelstr.split(LABELSEP)
- newlabels = []
- for label in labels:
- if label != "" and lcprefix != "":
- newlabels.append(lcprefix + ":" + label)
- else:
- newlabels.append(label)
- result = LABELSEP.join(newlabels)
- return result
- except TypeError:
- logger.warning('non-str enrich: %s %s', labelstr, type(labelstr))
- return labelstr
-
-
-def getlabels(labelstr, patterns):
- results = []
- (pattern, fullpattern) = patterns
- if fullpattern.match(labelstr):
- matches = pattern.finditer(labelstr)
- results = [m.group(0) for m in matches if m.group(0) not in ' ;,-/']
- else:
- results = []
- matches = pattern.finditer(labelstr)
- logstr = str([m.group(0) for m in matches if m.group(0) not in ' ;,-'])
- logger.warning('Cannot interpret %s; found items: %s',
- labelstr, logstr)
- return results
-
-
-def clean_item(item: str):
- clean_item = item.lower().strip()
- clean_item = re.sub(pattern=r' +', repl=' ', string=clean_item)
- return clean_item
-
-
-def item2queryid(item: str, level: str,
- mapping: TupleStrDict):
- if (item, level) in mapping:
- return mapping[(item, level)]
- return None
diff --git a/backend/analysis/conftest.py b/backend/analysis/conftest.py
index 82fb8669..174a9b67 100644
--- a/backend/analysis/conftest.py
+++ b/backend/analysis/conftest.py
@@ -15,65 +15,6 @@
CORRECTIONS_ASTA_16 = '{"Pause": [["15", "Pause", "[\'(..)\']", "CHAT", "None", "None", "uh (..) BEROEP1", "BEROEP1"], ["33", "Pause", "[\'(.)\']", "CHAT", "None", "None", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["33", "Pause", "[\'(.)\']", "CHAT", "None", "None", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["38", "Pause", "[\'(..)\']", "CHAT", "None", "None", "dat heb ik net nog gelezen (..)", null], ["45", "Pause", "[\'(..)\']", "CHAT", "None", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "Pause", "[\'(..)\']", "CHAT", "None", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["46", "Pause", "[\'(..)\']", "CHAT", "None", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["47", "Pause", "[\'(...)\']", "CHAT", "None", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "Pause", "[\'(.)\']", "CHAT", "None", "None", "en uh (...) uh (.) uh sorry", "en"]], "parsed_as": [["1", "parsed_as", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad", "SASTA", "Correction", "None", "ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad ", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad"], ["4", "parsed_as", "en nu krijg ik te horen", "SASTA", "Correction", "None", "en uh nu krijg ik te horen", "en nu krijg ik te horen"], ["6", "parsed_as", "en verder het gaat redelijk denk ik", "SASTA", "Correction", "None", "en verder ja het gaat redelijk denk ik", "en verder het gaat redelijk denk ik"], ["7", "parsed_as", "ik ben eerst naar een ziekenhuis geweest een aantal weken", "SASTA", "Correction", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["8", "parsed_as", "toen een aantal weken in een iets van zorg", "SASTA", "Correction", "None", "toen een aantal weken in een een iets van zorg ", "toen een aantal weken in een iets van zorg"], ["9", "parsed_as", "ik weet niet in uh( . )", "SASTA", "Correction", "None", "ik weet niet uh in uh(.)", "ik weet niet in uh( . )"], ["10", "parsed_as", "buiten is Breda", "SASTA", "Correction", "None", "uh buiten Breda ", "buiten is Breda"], ["13", "parsed_as", "en toen ik zo ver weer was ben ik naar hier gekomen", "SASTA", "Correction", "None", "en en toen ik zo ver weer was ben ik naar hier gekomen", "en toen ik zo ver weer was ben ik naar hier gekomen"], ["14", "parsed_as", "dat heet de ZORGINSTELLING1", "SASTA", "Correction", "None", "dat heet de uh ZORGINSTELLING1", "dat heet de ZORGINSTELLING1"], ["15", "parsed_as", "BEROEP1", "SASTA", "Correction", "None", "uh (..) 
BEROEP1", "BEROEP1"], ["16", "parsed_as", "is heel erg leuk", "SASTA", "Correction", "None", "ja is heel erg leuk ja", "is heel erg leuk"], ["20", "parsed_as", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn", "SASTA", "Correction", "None", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn ja", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn"], ["21", "parsed_as", "en ook ouders heel goed contact", "SASTA", "Correction", "None", "ja en ook ouders heel goed contact", "en ook ouders heel goed contact"], ["22", "parsed_as", "kinderen worden gebracht", "SASTA", "Correction", "None", "ki kinderen worden gebracht", "kinderen worden gebracht"], ["23", "parsed_as", "en dan is het contact goed met de ouders", "SASTA", "Correction", "None", "en uh dan is het contact goed met de ouders", "en dan is het contact goed met de ouders"], ["25", "parsed_as", "en kinderen zijn fijn lief", "SASTA", "Correction", "None", "en kinderen zijn ja fijn lief", "en kinderen zijn fijn lief"], ["26", "parsed_as", "en voelen zich wel gelukkig bij ons", "SASTA", "Correction", "None", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["28", "parsed_as", "ik werk drie dagen", "SASTA", "Correction", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["29", "parsed_as", "ik begin ik om half acht tot ik denk tot zes uur", "SASTA", "Correction", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["30", "parsed_as", "toevallig hierachter", "SASTA", "Correction", "None", "ja toevallig hierachter ", "toevallig hierachter"], ["31", "parsed_as", "kinderen met beperking", "SASTA", "Correction", "None", "uh kinderen met beperking", "kinderen met beperking"], ["33", "parsed_as", "ik weet het niet", "SASTA", "Correction", "None", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["35", "parsed_as", "ik herken het", "SASTA", "Correction", "None", "ik ik herken het", "ik herken het"], ["36", "parsed_as", "daar ben ik veel geweest", "SASTA", "Correction", "None", "wauw daar ben ik veel geweest", "daar ben ik veel geweest"], ["37", "parsed_as", "ook hier de", "SASTA", "Correction", "None", "ook hier de uh", "ook hier de"], ["40", "parsed_as", "veel geweest voor", "SASTA", "Correction", "None", "veel geweest voor uh", "veel geweest voor"], ["42", "parsed_as", "net gekoppeld", "SASTA", "Correction", "None", "ja uh net gekoppeld", "net gekoppeld"], ["44", "parsed_as", "dus maar ben ik veel geweest maar ook als ik hier", "SASTA", "Correction", "None", "dus maar ben ik veel geweest maar ook als ik hier uh", "dus maar ben ik veel geweest maar ook als ik hier"], ["45", "parsed_as", "ik zing met oudere mensen", "SASTA", "Correction", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["46", "parsed_as", "ik doe boekjes voor club geloof ik", "SASTA", "Correction", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["47", "parsed_as", "en", "SASTA", "Correction", "None", "en uh (...) uh (.) 
uh sorry", "en"]], "ExtraGrammatical": [["1", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad ", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad"], ["1", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad ", "ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad"], ["4", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh nu krijg ik te horen", "en nu krijg ik te horen"], ["6", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "en verder ja het gaat redelijk denk ik", "en verder het gaat redelijk denk ik"], ["7", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "Repeated ja, nee, nou", "SASTA", "Syntax", "Repetition", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["7", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "oh ja sorry ja ik ben eerst uh naar een ziekenhuis geweest een aantal weken", "ik ben eerst naar een ziekenhuis geweest een aantal weken"], ["8", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "toen een aantal weken in een een iets van zorg ", "toen een aantal weken in een iets van zorg"], ["9", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik weet niet uh in uh(.)", "ik weet niet in uh( . )"], ["10", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "uh buiten Breda ", "buiten is Breda"], ["13", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "en en toen ik zo ver weer was ben ik naar hier gekomen", "en toen ik zo ver weer was ben ik naar hier gekomen"], ["14", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "dat heet de uh ZORGINSTELLING1", "dat heet de ZORGINSTELLING1"], ["15", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "uh (..) 
BEROEP1", "BEROEP1"], ["16", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja is heel erg leuk ja", "is heel erg leuk"], ["16", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja is heel erg leuk ja", "is heel erg leuk"], ["20", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn ja", "en is ook leuk kon kinderen zo gezellig zo lief zo fijn"], ["21", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja en ook ouders heel goed contact", "en ook ouders heel goed contact"], ["22", "ExtraGrammatical", "Short Repetition", "SASTA", "Tokenisation", "Repetition", "ki kinderen worden gebracht", "kinderen worden gebracht"], ["23", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh dan is het contact goed met de ouders", "en dan is het contact goed met de ouders"], ["25", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "en kinderen zijn ja fijn lief", "en kinderen zijn fijn lief"], ["26", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["26", "ExtraGrammatical", "Word token of a repeated word token sequence", "SASTA", "Tokenisation", "Repetition", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["26", "ExtraGrammatical", "Word token of a repeated word token sequence", "SASTA", "Tokenisation", "Repetition", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["26", "ExtraGrammatical", "Word token of a repeated word token sequence", "SASTA", "Tokenisation", "Repetition", "en voelen zich wel uh voelen zich wel gelukkig bij ons", "en voelen zich wel gelukkig bij ons"], ["28", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["28", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["28", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "oo uh uh ik werk drie dagen", "ik werk drie dagen"], ["29", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["29", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["29", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja ik begin ik om uh half acht tot ik denk tot zes uur ja", "ik begin ik om half acht tot ik denk tot zes uur"], ["30", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja toevallig hierachter ", "toevallig hierachter"], ["31", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "uh kinderen met beperking", "kinderen met beperking"], ["33", "ExtraGrammatical", "Repeated ja, nee, nou", "SASTA", "Syntax", "Repetition", "(.) ja (.) ja ik weet het niet", "ik weet het niet"], ["33", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "(.) ja (.) 
ja ik weet het niet", "ik weet het niet"], ["35", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "ik ik herken het", "ik herken het"], ["36", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "wauw daar ben ik veel geweest", "daar ben ik veel geweest"], ["37", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ook hier de uh", "ook hier de"], ["40", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "veel geweest voor uh", "veel geweest voor"], ["42", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ja uh net gekoppeld", "net gekoppeld"], ["42", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "ja uh net gekoppeld", "net gekoppeld"], ["44", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "dus maar ben ik veel geweest maar ook als ik hier uh", "dus maar ben ik veel geweest maar ook als ik hier"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["45", "ExtraGrammatical", "ja, nee or nou filled pause", "SASTA", "Syntax", "None", "oo (..) uh ja uh uh (..) ik zing met uh oudere mensen uh", "ik zing met oudere mensen"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["46", "ExtraGrammatical", "Repeated word token", "SASTA", "Tokenisation", "Repetition", "ik uh (..) uh ik doe uh boekjes voor uh club geloof ik uh", "ik doe boekjes voor club geloof ik"], ["47", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "ExtraGrammatical", "Filled Pause", "SASTA", "Syntax", "None", "en uh (...) uh (.) uh sorry", "en"], ["47", "ExtraGrammatical", "Interjection", "SASTA", "Syntax", "None", "en uh (...) uh (.) uh sorry", "en"]]}'
-@pytest.fixture
-def cha_testfiles_dir():
- return op.join(settings.BASE_DIR, 'test_files')
-
-
-@pytest.fixture
-def tarsp_category(db):
- obj = MethodCategory.objects.create(name='TARSP', zc_embeddings=True, levels=['Sz', 'Zc', 'Wg', 'VVW'], marking_postcodes=['[+ G]'])
- yield obj
- obj.delete()
-
-
-@pytest.fixture
-def stap_category(db):
- obj = MethodCategory.objects.create(name='STAP', zc_embeddings=False, levels=['Complexiteit', 'Grammaticale fout'], marking_postcodes=['[+ G]', '[+ VU]'])
- yield obj
- obj.delete()
-
-
-@pytest.fixture
-def asta_category(db):
- obj = MethodCategory.objects.create(name='ASTA', zc_embeddings=False, levels=[
- "Samplegrootte",
- "MLU",
- "Taalmaat",
- "Foutenanalyse",
- "Lemma"
- ], marking_postcodes=["[+ G]"])
- yield obj
- obj.delete()
-
-
-@pytest.fixture
-def method_dir():
- return op.join(sd_settings.SD_DIR, 'data', 'methods')
-
-
-@pytest.fixture
-def tarsp_method(db, tarsp_category, method_dir):
- file = glob.glob(f'{method_dir}/TARSP Index Current.xlsx')[0]
- with open(file, 'rb') as f:
- wrapped_file = File(f)
- instance = AssessmentMethod(name='tarsp_test_method', category=tarsp_category)
- instance.content.save(op.basename(file), wrapped_file)
- yield instance
- instance.delete()
-
-
-@pytest.fixture
-def asta_method(db, asta_category, method_dir):
- file = glob.glob(f'{method_dir}/ASTA_Index_Current.xlsx')[0]
- with open(file, 'rb') as f:
- wrapped_file = File(f)
- instance = AssessmentMethod(name='asta_test_method', category=asta_category)
- instance.content.save(op.basename(file), wrapped_file)
- yield instance
- instance.delete()
-
-
@pytest.fixture
def tarsp_corpus(db, admin_user, tarsp_method, tarsp_category):
obj = Corpus.objects.create(
diff --git a/backend/analysis/convert/replacements.py b/backend/analysis/convert/replacements.py
index 93394d0c..ab68cbbc 100644
--- a/backend/analysis/convert/replacements.py
+++ b/backend/analysis/convert/replacements.py
@@ -2,11 +2,11 @@
from string import ascii_lowercase
import os.path as op
import json
-from django.conf import settings
+from sastadev.conf import settings as sdsettings
def instantiate_anonymizations():
- json_path = op.join(settings.BASE_DIR, 'anonymization.json')
+ json_path = op.join(sdsettings.SD_DIR, 'data', 'anonymization.json')
with open(json_path, 'r') as f:
return json.load(f)
@@ -35,7 +35,8 @@ def fill_name(string):
def repl(match):
raw_index = match.group(3) or '0'
- index = int(raw_index) if raw_index.isnumeric() else letter_index(raw_index)
+        index = (int(raw_index) if raw_index.isnumeric()
+                 else letter_index(raw_index))
repl = specs['common'][index]
return match.group(1) + repl + match.group(4)
diff --git a/backend/analysis/convert/tests/conftest.py b/backend/analysis/convert/tests/conftest.py
index 8ffd27c3..dc29e31b 100644
--- a/backend/analysis/convert/tests/conftest.py
+++ b/backend/analysis/convert/tests/conftest.py
@@ -131,4 +131,9 @@ def example_utterances():
'exp_text': 'Ik heet Jan en hij heet Anna.',
'exp_tiers': {'xano': '8|NAAM1|Jan, 24|NAAM2|Anna'},
},
+ {
+ 'text': 'Ik zit op de SCHOOL1 en hij op de SCHOOL2.',
+ 'exp_text': 'Ik zit op de Mariaschool en hij op de Calvijnschool.',
+ 'exp_tiers': {'xano': '13|SCHOOL1|Mariaschool, 38|SCHOOL2|Calvijnschool'},
+ },
]
diff --git a/backend/analysis/macros/__init__.py b/backend/analysis/macros/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/analysis/macros/functions.py b/backend/analysis/macros/functions.py
deleted file mode 100644
index 6e110fbe..00000000
--- a/backend/analysis/macros/functions.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import re
-import os.path as op
-import logging
-logger = logging.getLogger('sasta')
-
-idpat = r'([A-z_][A-z0-9_]*)'
-eqpat = r'='
-exprpat = r'"""(.*?)"""'
-whitespaces = r'\s+'
-
-macrocallpat = r'(%.+?%)'
-macrocallre = re.compile(macrocallpat)
-
-macropat = idpat + whitespaces + eqpat + whitespaces + exprpat
-
-macrore = re.compile(macropat, re.S)
-
-MACROFILENAMES = ['sastamacros1.txt', 'sastamacros2.txt']
-
-
-def macrostrs2dict(teststrings):
- macrodict = {}
- for tstr in teststrings:
- macromatches = macrore.finditer(tstr)
- for macromatch in macromatches:
- macroname = macromatch.group(1)
- macroexpr = macromatch.group(2)
- macrodict[macroname] = macroexpr
-
- return macrodict
-
-
-def readmacros(macrofile, macrodict):
- macrotext = macrofile.read()
- macromatches = macrore.finditer(macrotext)
- for macromatch in macromatches:
- macroname = macromatch.group(1)
- macroexpr = macromatch.group(2)
- if macroname in macrodict:
- logger.warning(
- 'Duplicate macro {} encountered. Ignored'.format(macroname))
- else:
- macrodict[macroname] = macroexpr
- return macrodict
-
-
-def expandmacros(expr, macrodict):
- result = expandmacrosdict(expr, macrodict)
- return result
-
-
-def expandmacrosdict(expr, macrodict):
- newexpr = expr
- thematch = macrocallre.search(newexpr)
- while thematch:
- macrocall = thematch.group(1)
- macroname = macrocall[1:-1]
- if macroname in macrodict:
- newexpr = macrocallre.sub(macrodict[macroname], newexpr)
- thematch = macrocallre.search(newexpr)
- else:
- logger.error(
- 'Unknown macro call encountered: {}.'.format(macroname))
- break
- return newexpr
-
-
-def get_macros_dict(macrofilenames=MACROFILENAMES):
- macrodict = {}
- for macrofilename in macrofilenames:
- script_dir = op.dirname(op.abspath(__file__))
- file_path = op.join(script_dir, macrofilename)
- macrofile = open(file_path, 'r', encoding='utf8')
- macrodict = readmacros(macrofile, macrodict)
- macrofile.close()
- return macrodict
diff --git a/backend/analysis/macros/sastamacros1.txt b/backend/analysis/macros/sastamacros1.txt
deleted file mode 100644
index 864143c6..00000000
--- a/backend/analysis/macros/sastamacros1.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-Tarsp_VCr1 = """node[@rel="obj1" or @rel="pc" or @rel="predc" or @rel="ld" ] """
-
-
-b = """number(@begin)"""
-e = """number(@end)"""
-single_name = """( @ntype = 'eigen' or @postag='SPEC(deeleigen)' )"""
-
-multi_name = """( @cat='mwu' and node[@rel='mwp' and %single_name%] ) """
-
-name = """( %single_name% or %multi_name% )"""
-
-name_phrase= """( %name% or node[@rel="hd" and %name%] )"""
-
-booster = """(@lemma="allemachtig" or @lemma="beestachtig" or @lemma="bijzonder" or @lemma="bliksems" or @lemma="bloedig" or @lemma="bovenmate" or @lemma="buitengewoon" or @lemma="buitenmate" or @lemma="buitensporig" or @lemma="crimineel" or @lemma="deerlijk" or @lemma="deksels" or @lemma="donders" or @lemma="drommels" or @lemma="eindeloos" or @lemma="enorm" or @lemma="erbarmelijk" or @lemma="fantastisch" or @lemma="formidabel" or @lemma="geweldig" or @lemma="goddeloos" or @lemma="godsjammerlijk" or @lemma="grenzeloos" or @lemma="grotelijks" or @lemma="heel" or @lemma="ijselijk" or @lemma="ijzig" or @lemma="intens" or @lemma="krankzinnig" or @lemma="machtig" or @lemma="mirakels" or @lemma="monsterachtig" or @lemma="moorddadig" or @lemma="oneindig" or @lemma="onnoemelijk" or @lemma="ontiegelijk" or @lemma="ontstellend" or @lemma="ontzaglijk" or @lemma="ontzettend" or @lemma="onuitsprekelijk" or @lemma="onvoorstelbaar" or @lemma="onwezenlijk" or @lemma="onwijs" or @lemma="overweldigend" or @lemma="peilloos" or @lemma="reusachtig" or @lemma="reuze" or @lemma="schrikkelijk" or @lemma="sterk" or @lemma="uiterst" or @lemma="verdomd" or @lemma="verdraaid" or @lemma="verduiveld" or @lemma="verduveld" or @lemma="verrekt" or @lemma="verrot" or @lemma="verschrikkelijk" or @lemma="vervloekt" or @lemma="vreselijk" or @lemma="waanzinnig" or @lemma="zeer" or @lemma="zeldzaam" or @lemma="zwaar" )"""
-
-JO_v3 = """
- count(.//node[@pt="ww"])>=3 and
- (some $v1 in .//node[@pt="ww"],
- $v2 in .//node[@pt="ww"],
- $v3 in .//node[@pt="ww"]
- satisfies ($v1/%PQ_e% < $v2/%PQ_e% and
- $v2/%PQ_e% < $v3/%PQ_e% and
- $v1/%PQ_e% < $v3/%PQ_e%
- )
- )"""
-
-JO_kijken_naar = """ parent::node[@cat="pp" and
- node[@lemma="naar" and @rel= "hd"] and
- parent::node[ node[@pt= "ww" and
- @rel= "hd" and
- @lemma= "kijken"
- ]
- ]
- ]
-"""
-
-Tarsp_hww = """
- (@lemma="kunnen" or
- @lemma = "moeten" or
- @lemma= "hoeven" or
- @lemma = "blijven" or
- @lemma = "willen" or
- @lemma = "zullen" or
- @lemma = "doen" or
- @lemma = "gaan" or
- @lemma = "horen" or
- @lemma = "komen" or
- @lemma = "laten" or
- @lemma = "liggen" or
- @lemma = "lopen" or
- @lemma = "mogen" or
- @lemma = "staan" or
- @lemma = "zitten ")
- """
-
-Tarsp_OndWBVC = """
-(@cat="smain" or @cat="ssub") and node[@rel="su"] and node[@rel="hd" and @pt="ww"] and
- ((count(node[@rel!="svp"]) = 4 and node[@rel="mod"] and %Tarsp_VCr1% ) or
- (count(node) = 3 and node[node[@rel="mod"] and %Tarsp_VCr1% ])
- )
-"""
-
-Tarsp_OndWBB = """
-(@cat="smain" or @cat="ssub") and node[@rel="su"] and node[@rel="hd" and @pt="ww"] and
- ((count(node[@rel!="svp"]) = 4 and count(node[@rel="mod" or @rel="ld"]) = 2 ) or
- (count(node) = 3 and node[@rel="vc" and (@cat="inf" or @cat="ppart") and count(node[@rel="mod" or @rel="ld"]) =2 ])
- )
-"""
-
-ASTA_pred = """(@rel="predc" or @rel="predm" or (@rel="hd" and parent::node[@rel="predc" or @rel="predm"]))"""
-
-ASTA_attr = """((@rel="mod" and parent::node[node[@rel="hd" and (@pt="n" or @pt="vnw")]] ) or
- (@rel="hd" and parent::node[@rel="mod" and parent::node[node[@rel="hd" and (@pt="n" or @pt="vnw")]]]))"""
-
-
-ASTA_adverbial = """
- ((@rel="mod" and parent::node[node[@rel="hd" and not(@pt="n" or @pt="vnw")]] ) or
- (@rel="hd" and parent::node[@rel="mod" and parent::node[node[@rel="hd" and not(@pt="n" or @pt="vnw")]]]))
- """
-
-
-ASTA_modalww = """ (@lemma="zullen" or @lemma="willen" or @lemma="moeten" or @lemma="mogen" or @lemma="kunnen") """
-
-
-ASTA_kopww = """ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and parent::node[node[@rel="predc"] and not(node[@rel="obj1"])] )"""
-
-ASTA_wantmaarbijzin = """
-((@cat="smain" or (@cat="du" and node[@cat="smain" and @rel="nucl"])) and
- ../node[(@word="want" or @word="maar") and @rel="crd" and @pt="vg"] and
- @begin>=../node[(@word="want" or @word="maar")]/@end)
-"""
-
-ASTA_dusbijzin = """
-(@cat="smain" and node[@lemma="dus" and @begin=parent::node/@begin and @pt="bw" and @rel="mod"])
-"""
-
-ASTA_detadjs = """
- (@pt="vnw" and (@rel="mod" or @rel="det") and @vwtype="onbep" and parent::node[@cat="np"] and
-(@lemma="al" or @lemma="beide" or @lemma="elk" or @lemma="enig" or @lemma="enkel" or @lemma="geen" or @lemma="ieder" or
- @lemma="meer" or @lemma="meerdere" or @lemma="meest" or @lemma="menig" or @lemma="minder" or
- @lemma="minst" or @lemma="sommige" or @lemma="veel" or @lemma="weinig"))
-
-"""
-
-Tarsp_kijkeens = """
-(@cat="sv1" and @rel != "--" and
- node[@pt="ww" and @lemma="kijken" and @rel="hd" and @pvagr="ev" and @pvtijd="tgw" ] and
- node[@lemma="eens" and @rel="mod" and @pt="bw"] and count(node)=2)
-"""
-
-ASTA_numeral = """
-(@lemma="twee" or @lemma="drie" or @lemma="vier" or @lemma="vijf" or @lemma="zes" or @lemma="zeven" or @lemma="acht" or @lemma="negen" or @lemma="tien" or @lemma="elf" or @lemma="twaalf" or @lemma="dertien" or @lemma="veertien" or @lemma="vijftien" or @lemma="zestien" or @lemma="zeventien" or @lemma="achttien" or @lemma="negentien" or @lemma="twintig" or @lemma="eentje" or @lemma="tweetjes" or @lemma="drietjes" or @lemma="viertjes" or @lemma="vijfjes" or @lemma="zesjes")
-"""
-
-ASTA_filled_pause = """
-(@lemma= "uh" or @lemma ="Uh" or @lemma="Uhm" or @lemma= "uhm" or @lemma = "euh"or @lemma = "eh" or @lemma = "goh" or @word="xxx" or @word="XXX")
-"""
-
-
-
-
\ No newline at end of file
diff --git a/backend/analysis/macros/sastamacros2.txt b/backend/analysis/macros/sastamacros2.txt
deleted file mode 100644
index 26b2f5d9..00000000
--- a/backend/analysis/macros/sastamacros2.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-STAP_geen_BB = """
-not(@lemma="al"
- or @lemma="dan"
- or @lemma="dus"
- or @lemma="eens"
- or @lemma="gewoon"
- or @lemma="meer"
- or @lemma="niet"
- or @lemma="nog"
- or @lemma="nou"
- or @lemma="nu"
- or @lemma="ook"
- or @lemma="toch"
- or @lemma="toen"
- or @lemma="weer"
- or @lemma="wel"
- or @lemma="zo")
-"""
-
-
-STAP_BB_t = """
-not((((@frame="tmp_adverb"
- or @frame="adjective(both(tmpadv))"
- and not(parent::node[@rel="mod"]))
- or ((@cat="pp"
- or @rel="mod")
- and (node[@special="tmp"
- or node[@special="tmp"]]))))
- or (@cat="pp" and node[@pt="vz"
- and (@lemma="sinds" or @lemma="gedurende"
- or @lemma="na")])
- or (@cat="np" and @rel="mod"
- and node[@lemma="elk" or @rel="det"]
- and node[@special="tmp"])
- or (@rel="mod" and
- (@lemma="net" or @lemma="gauw"
- or @lemma="vroeger" or @lemma="toen"
- or @lemma="soms" or @lemma="altijd")))
-"""
-
-STAP_BB_p = """
-not(((@cat="pp"
- and @rel="ld")
- or @frame="waar_adverb(naar)"
- or @frame="waar_adverb(in)"
- or @frame="waar_adverb(heen)"
- or @frame="er_wh_loc_adverb"
- or @frame="wh_loc_adverb"
- or @frame="er_vp_adverb"
- or @frame="er_adverb(uit)"
- or @frame="er_loc_adverb"
- or @frame="loc_adverb"
- or (@cat="pp"
- and node[@pt="vz"
- and (@lemma="op"
- or @lemma="bij"
- or @lemma="in")]))
- or @cat="pp" and node[@pt="vz"
- and (@lemma="op" or @lemma="boven"
- or @lemma="onder" or @lemma="tussen"
- or @lemma="naast" or @lemma="achter"
- or @lemma="bij" or @lemma="naar")])
-"""
diff --git a/backend/analysis/macros/tests.py b/backend/analysis/macros/tests.py
deleted file mode 100644
index 5535b9c2..00000000
--- a/backend/analysis/macros/tests.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import pytest
-from .functions import expandmacros, get_macros_dict
-import os.path as op
-
-# flake8: noqa: E501
-
-TESTSTRINGS = ['b = """number(@begin)"""', 'e = """number(@end)"""',
- 'single_name = """( @ntype = "eigen" or @postag="SPEC(deeleigen)" )"""',
- 'multi_name = """( @cat=''mwu'' and node[@rel=''mwp'' and %single_name%] ) """',
- 'name = """( %single_name% or %multi_name% )"""',
- 'name_phrase= """( %name% or node[@rel="hd" and %name%] )"""']
-TESTQUERIES = [('//node[%b%="3"]', '//node[number(@begin)="3"]'),
- ('//node[%single_name%]',
- "//node[( @ntype = 'eigen' or @postag='SPEC(deeleigen)' )]"),
- ("//node[%multi_name%]",
- "//node[( @cat='mwu' and node[@rel='mwp' and ( @ntype = 'eigen' or @postag='SPEC(deeleigen)' )] ) ]"),
- ("//node[%fout%]", "//node[%fout%]")]
-
-MACROFILENAMES = ['sastamacros1.txt',
- 'sastamacros2.txt']
-HERE = op.dirname(op.abspath(__file__))
-MACROFILENAMES = [op.join(HERE, fn) for fn in MACROFILENAMES]
-
-
-@pytest.mark.parametrize('short, long', TESTQUERIES)
-def test_macro_expansion(short, long):
- macrodict = get_macros_dict(MACROFILENAMES)
- for (short, long) in TESTQUERIES:
- expansion = expandmacros(short, macrodict)
- assert expansion == long
diff --git a/backend/analysis/migrations/0033_assessmentquery_literal.py b/backend/analysis/migrations/0033_assessmentquery_literal.py
new file mode 100644
index 00000000..336e1992
--- /dev/null
+++ b/backend/analysis/migrations/0033_assessmentquery_literal.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.14 on 2023-11-15 15:46
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('analysis', '0032_method_subj_adapt'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='assessmentquery',
+ name='literal',
+ field=models.CharField(blank=True, default='', max_length=200),
+ ),
+ ]
diff --git a/backend/analysis/models.py b/backend/analysis/models.py
index 1d4c06a6..6cc29e23 100644
--- a/backend/analysis/models.py
+++ b/backend/analysis/models.py
@@ -4,17 +4,18 @@
import zipfile
from io import BytesIO
from itertools import chain
-from typing import List, Optional, Tuple
+from typing import Dict, List, Tuple
from uuid import uuid4
-from analysis.annotations.utils import clean_item
from analysis.managers import SastaQueryManager
from django.contrib.auth.models import User
from django.contrib.postgres.fields import ArrayField
from django.db import models
from lxml import etree as ET
from sastadev.external_functions import form_map
+from sastadev.methods import Method
from sastadev.query import Query
+from sastadev.readmethod import read_method
logger = logging.getLogger('sasta')
@@ -68,7 +69,8 @@ def upload_path(self, filename):
content = models.FileField(
upload_to=upload_path, blank=True, null=True, max_length=500)
category = models.ForeignKey(
- MethodCategory, related_name='definitions', blank=True, null=True, on_delete=models.CASCADE)
+ MethodCategory, related_name='definitions',
+ blank=True, null=True, on_delete=models.CASCADE)
def __str__(self):
return self.name
@@ -77,12 +79,10 @@ class Meta:
unique_together = (('category', 'name'))
get_latest_by = ('date_added', )
- def get_item_mapping(self, sep):
- queries = self.queries.all()
- mapping = {}
- for q in queries:
- mapping.update(q.get_item_mapping(sep))
- return mapping
+ def to_sastadev(self) -> Method:
+ cat_name = self.category.name.lower()
+ location = self.content.path
+ return read_method(cat_name, location)
class Corpus(models.Model):
@@ -94,9 +94,12 @@ class Corpus(models.Model):
date_added = models.DateField(auto_now_add=True)
date_modified = models.DateField(auto_now=True)
default_method = models.ForeignKey(AssessmentMethod,
- on_delete=models.SET_NULL, related_name='corpora', blank=True, null=True)
+ on_delete=models.SET_NULL,
+ related_name='corpora',
+ blank=True, null=True)
method_category = models.ForeignKey(
- MethodCategory, on_delete=models.SET_DEFAULT, default=1, related_name='corpora')
+ MethodCategory, on_delete=models.SET_DEFAULT,
+ default=1, related_name='corpora')
def __str__(self):
return self.name
@@ -176,9 +179,11 @@ def get_utterance_by_id(self, utt_id: int):
except Exception:
raise
- def get_filepaths(self) -> Tuple[str]:
+ def get_filepaths(self) -> Tuple:
if self.corrected_content:
- return (self.content.path, self.parsed_content.path, self.corrected_content.path)
+ return (self.content.path,
+ self.parsed_content.path,
+ self.corrected_content.path)
return (self.content.path, self.parsed_content.path)
@property
@@ -193,6 +198,13 @@ def convertable(self):
def parseable(self):
return self.status in (self.CONVERTED, self.PARSING_FAILED)
+ @property
+ def latest_run(self):
+ try:
+ return self.analysisruns.latest()
+ except AnalysisRun.DoesNotExist:
+ return None
+
class Utterance(models.Model):
sentence = models.CharField(max_length=500)
@@ -229,15 +241,17 @@ def for_analysis(self):
def word_elements(self) -> List[ET._Element]:
'''List of word elements, sorted by word (begin, end)'''
word_elements = self.syntree.findall('.//node[@word]')
- return sorted(word_elements, key=lambda x: (int(x.attrib.get('begin')), int(x.attrib.get('end'))))
+ return sorted(word_elements, key=lambda x: (int(x.attrib.get('begin')),
+ int(x.attrib.get('end'))))
@property
@functools.lru_cache(maxsize=128)
- def word_position_mapping(self) -> List[Tuple[Optional[int], Optional[int]]]:
+ def word_position_mapping(self) -> List[Dict]:
''' List of dictionaries (begin, end) for each word in the utterance
starts with { begin:None, end:None } to represent unaligned
'''
- mapping = [{'begin': int(el.attrib.get('begin')), 'end': int(el.attrib.get('end'))}
+ mapping = [{'begin': int(el.attrib.get('begin')),
+ 'end': int(el.attrib.get('end'))}
for el in self.word_elements]
return [{'begin': None, 'end': None}] + mapping
@@ -289,6 +303,7 @@ class AssessmentQuery(models.Model):
inform = models.CharField(max_length=20, blank=True, default='')
screening = models.CharField(max_length=20, blank=True, default=True)
process = models.IntegerField(blank=True, null=True)
+ literal = models.CharField(max_length=200, blank=True, default='')
stars = models.CharField(max_length=50, blank=True, default='')
filter = models.CharField(max_length=200, blank=True, default='')
variants = models.CharField(max_length=200, blank=True, default='')
@@ -312,19 +327,6 @@ def get_items_list(self, str, sep, lower=True):
return []
return cleanresult
- def get_item_mapping(self, sep):
- ''' mapping of all possible items (including altitems) to this query'''
- if (not self.item) or (not self.level):
- return {}
- result = {(clean_item(self.item), self.level.lower()):
- (self.query_id, self.fase)}
- if self.altitems:
- for item in self.altitems:
- if (clean_item(item), self.level.lower()) not in result:
- result[(clean_item(item), self.level.lower())] = (
- self.query_id, self.fase)
- return result
-
def to_sastadev(self) -> Query:
sastadev_mapping = {'query_id': 'id'}
processes = ['pre', 'core', 'post', 'form']
@@ -353,12 +355,14 @@ def upload_path(self, filename):
transcript = models.ForeignKey(
Transcript, related_name='analysisruns', on_delete=models.CASCADE)
method = models.ForeignKey(
- AssessmentMethod, related_name='analysisruns', on_delete=models.CASCADE)
+ AssessmentMethod, related_name='analysisruns',
+ on_delete=models.CASCADE)
created = models.DateTimeField(auto_now_add=True)
query_file = models.FileField(upload_to=upload_path, max_length=500)
annotation_file = models.FileField(upload_to=upload_path, max_length=500)
is_manual_correction = models.BooleanField(
- default=False, help_text='this run was generated by parsing a user-uploaded SAF-file')
+ default=False,
+ help_text='this run was generated by parsing a user-uploaded SAF-file')
class Meta:
get_latest_by = "created"
diff --git a/backend/analysis/query/functions.py b/backend/analysis/query/functions.py
deleted file mode 100644
index 3679379e..00000000
--- a/backend/analysis/query/functions.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import logging
-from operator import attrgetter
-from typing import Callable, Dict, List, Union
-
-from analysis.models import AssessmentMethod, AssessmentQuery
-from analysis.results.results import UtteranceWord
-from analysis.score.zc_embedding import get_zc_embeddings
-from bs4 import BeautifulSoup as Soup
-from django.db.models import Q
-from lxml import etree as ET
-from sastadev.external_functions import form_map, str2functionmap
-from sastadev.macros import expandmacros, macrodict
-from sastadev.query import Query
-
-logger = logging.getLogger('sasta')
-
-
-class QueryWithFunction:
- def __init__(self, query, function):
- self.id: str = query.id
- self.query: Query = query
- self.function: Union[Callable, ET.XPath] = function
-
- def __repr__(self):
- return f'{self.id}: {type(self.function)}'
-
-
-def compile_queries(queries: List[Query]) -> List[QueryWithFunction]:
- results = []
- # macrodict = get_macros_dict()
-
- for query_model in queries:
- query = query_model.to_sastadev()
- func = compile_xpath_or_func(query.query, macrodict)
- if func:
- results.append(QueryWithFunction(query, func))
- return results
-
-
-def compile_xpath_or_func(query: str,
- macrodict: Dict) -> Union[Callable, ET.XPath]:
- try:
- if query in str2functionmap:
- return str2functionmap[query]
- expanded_query = expandmacros(query)
- return ET.XPath(expanded_query)
- except Exception as error:
- logger.warning(f'cannot compile {query.strip()}:\t{error}')
- return None
-
-
-def filter_queries(method: AssessmentMethod,
- phase: int = None,
- phase_exact: bool = True):
- '''
- # TODO: remove phase filtering?
- phase_exact:True returns only that phase
- False returns everything up to (and including) that phase
- '''
- try:
- form_queries = [f.__name__ for f in form_map.values()]
- all_queries = AssessmentQuery.objects.all().filter(
- Q(method=method)
- & Q(query__isnull=False)
- & ~Q(query__exact='')
- & ~Q(query__in=form_queries)
- & Q(inform='yes')
- )
- if phase:
- phase_filter = Q(fase=phase) if phase_exact else Q(
- fase__gte=phase)
- phase_queries = all_queries.filter(phase_filter)
- return phase_queries
- return all_queries
-
- except Exception as e:
- logger.warning(f'cannot filter queries for phase:\t{e}')
- print(e)
-
-
-def single_query_single_utt(query_func: Union[Callable, ET.XPath],
- syntree: ET._Element) -> List[ET._Element]:
- try:
- results = query_func(syntree)
- return results
- except Exception:
- logger.warning(f'Failed to execute {query_func}')
- return []
-
-
-def utt_from_tree(tree: str, embeddings=False) -> List[UtteranceWord]:
- # From a LASSY syntax tree, construct utterance representation
- # Output: sorted list of UtteranceWord instances
- soup = Soup(tree, 'lxml')
- utt = soup.alpino_ds
-
- embed_dict = get_zc_embeddings(ET.fromstring(tree)) if embeddings else None
-
- words = utt.findAll('node', {'word': True})
-
- unaligned = UtteranceWord(
- word='',
- begin=-1,
- end=0,
- hits=[],
- zc_embedding=0 if embed_dict else None
- )
-
- utt_words = [unaligned] + [UtteranceWord(
- word=w.get('word'),
- begin=w.get('begin'),
- end=w.get('end'),
- hits=[],
- zc_embedding=embed_dict[str(w.get('begin'))] if embed_dict else None)
- for w in words]
-
- # Sort the words and assign their real index
- sorted_words = sorted(utt_words, key=attrgetter('begin'))
- for i, w in enumerate(sorted_words):
- w.index = i
-
- return sorted_words
diff --git a/backend/analysis/query/query_transcript.py b/backend/analysis/query/query_transcript.py
new file mode 100644
index 00000000..a74b1689
--- /dev/null
+++ b/backend/analysis/query/query_transcript.py
@@ -0,0 +1,61 @@
+from typing import Tuple
+from analysis.models import AssessmentMethod, Transcript
+from sastadev.sastacore import SastaCoreParameters, sastacore
+from sastadev.targets import get_targets
+from lxml import etree
+from sastadev.methods import Method
+
+from annotations.reader import read_saf
+
+
+def prepare_parameters(infilename: str, method: Method, targets: int, annotationinput: bool) -> SastaCoreParameters:
+ return SastaCoreParameters(
+ annotationinput=annotationinput,
+ themethod=method,
+ infilename=infilename,
+ targets=targets
+ )
+
+
+def prepare_treebanks(transcript: Transcript) -> Tuple[Tuple[str, etree.ElementTree]]:
+ orig_fp = transcript.parsed_content.path
+ corr_fp = transcript.corrected_content.path
+ orig_treebank = etree.parse(orig_fp).getroot()
+ corr_treebank = etree.parse(corr_fp).getroot()
+ return (
+ (orig_fp, orig_treebank),
+ (corr_fp, corr_treebank)
+ )
+
+
+def run_sastacore(transcript: Transcript, method: AssessmentMethod, annotation_input: bool = False):
+ # get treebanks
+ orig_tb, corr_tb = prepare_treebanks(transcript)
+ # Retrieve targets from corrected treebank
+ targets = get_targets(corr_tb[1])
+    # Convert the method to its sastadev counterpart
+ sdmethod = method.to_sastadev()
+
+ if annotation_input:
+ existing_results = read_saf(
+ transcript.latest_run.annotation_file.path, sdmethod)
+ params = prepare_parameters(
+ transcript.latest_run.annotation_file.path,
+ sdmethod, targets, annotation_input)
+ res = sastacore(
+ origtreebank=None,
+ correctedtreebank=corr_tb[1],
+ annotatedfileresults=existing_results,
+ scp=params
+ )
+ else:
+ params = prepare_parameters(
+ corr_tb[0], sdmethod, targets, annotation_input)
+ res = sastacore(
+ origtreebank=orig_tb[1],
+ correctedtreebank=corr_tb[1],
+ annotatedfileresults=None,
+ scp=params
+ )
+
+ return res
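A minimal sketch of how this new entry point is meant to be driven (the object lookups are illustrative; a parsed `Transcript` and an `AssessmentMethod` are assumed to exist, and `sastacore` is taken to return the results together with a sample size, as the caller in `run.py` below unpacks them):

    from analysis.models import AssessmentMethod, Transcript
    from analysis.query.query_transcript import run_sastacore

    transcript = Transcript.objects.get(name='single_utt')          # hypothetical lookup
    method = AssessmentMethod.objects.get(name='asta_test_method')  # hypothetical lookup

    # without an existing SAF file, both treebanks are queried directly
    allresults, samplesize = run_sastacore(transcript, method, annotation_input=False)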
diff --git a/backend/analysis/query/run.py b/backend/analysis/query/run.py
index 119f1905..6eb8462d 100644
--- a/backend/analysis/query/run.py
+++ b/backend/analysis/query/run.py
@@ -1,136 +1,17 @@
import logging
-from collections import Counter, defaultdict
-from typing import Dict, List, Set
-from analysis.annotations.safreader import SAFReader
-from analysis.models import (AnalysisRun, AssessmentMethod, AssessmentQuery,
- Transcript, Utterance)
-from analysis.results.results import AllResults, SastaMatches, SastaResults
-from sastadev.query import Query, core_process, post_process, pre_process
-
-from .functions import (QueryWithFunction, compile_queries, filter_queries,
- single_query_single_utt, utt_from_tree)
+from analysis.models import (AssessmentMethod, Transcript)
+from analysis.query.query_transcript import run_sastacore
+from sastadev.allresults import AllResults
logger = logging.getLogger('sasta')
-def query_transcript(transcript: Transcript,
- method: AssessmentMethod,
- annotate: bool = False,
- zc_embed: bool = False):
- # TODO: LOGGING
-
- queries: List[AssessmentQuery] = filter_queries(method)
- queries_with_funcs: List[QueryWithFunction] = compile_queries(queries)
- utterances: List[Utterance] = Utterance.objects.filter(
- transcript=transcript)
- to_analyze_utterances = [x for x in utterances if x.for_analysis]
- utterance_syntrees = [(x.utt_id, x.syntree) for x in to_analyze_utterances]
- allutts = {utt.utt_id: utt.word_list for utt in to_analyze_utterances}
- logger.info(
- f'Analyzing {len(to_analyze_utterances)} of {len(utterances)} utterances..')
-
- coreresults, allmatches, exact_results, corelevels, annotations = run_core_queries(
- to_analyze_utterances,
- queries_with_funcs,
- zc_embed,
- annotate)
-
- annotationinput = False
- runs = AnalysisRun.objects.filter(transcript=transcript)
- if runs: # An annotations file exists, base further results on this
- latest_run = runs.latest()
- reader = SAFReader(filepath=latest_run.annotation_file.path,
- method=method, transcript=transcript)
- coreresults = reader.document.to_allresults().coreresults
- annotations = reader.document.reformatted_annotations
- exact_results = reader.document.exactresults
- annotationinput = True
-
- allresults = AllResults(filename=transcript.name,
- uttcount=len(to_analyze_utterances),
- coreresults=coreresults,
- exactresults=exact_results,
- postresults=None,
- allmatches=allmatches,
- annotations=annotations,
- analysedtrees=utterance_syntrees,
- annotationinput=annotationinput,
- allutts=allutts
- )
-
- run_post_queries(allresults, queries_with_funcs)
- return allresults, queries_with_funcs
-
-
-def run_core_queries(utterances: List[Utterance],
- queries: List[QueryWithFunction],
- zc_embed: bool,
- annotate: bool):
- levels: Set[str] = set([])
- allmatches: SastaMatches = defaultdict(list)
- results: SastaResults = {}
- annotations = {}
- exact_results = defaultdict(list)
-
- core_queries: List[QueryWithFunction] = sorted(
- [q for q in queries if q.query.process in [pre_process, core_process]],
- key=lambda x: (x.query.process, x.query.id))
-
- for utt in utterances:
- if annotate:
- utt_res = utt_from_tree(utt.parse_tree, zc_embed)
- for q in core_queries:
- matches = single_query_single_utt(q.function, utt.syntree)
- if matches:
- if q.id in results:
- results[q.id].update(
- {utt.utt_id: len(matches)})
- else:
- results[q.id] = Counter(
- {utt.utt_id: len(matches)})
- for m in matches:
- levels.add(q.query.level)
- # Record the match including the syntree
- allmatches[(q.id, utt.utt_id)].append((m, utt.syntree))
- # Record the exact word where the query was matched
-
- word_index = next((i for i, item in enumerate(
- utt.word_position_mapping) if item["begin"] == int(m.get('begin'))), None)
- # exact_results[q.id].append((utt.utt_id, int(m.get('begin')) + 1))
- exact_results[q.id].append((utt.utt_id, word_index))
-
- if annotate:
- begin = int(m.get('begin'))
- hit = {
- 'level': q.query.level,
- 'item': q.query.item,
- 'fase': q.query.fase
- }
- matched_word = next(
- (w for w in utt_res if w.begin == begin), None)
- if matched_word:
- matched_word.hits.append(hit)
- else:
- logger.warning(
-                        f'Found hit ({q.query.level}, {q.query.item}, {q.query.fase}) for non-existing begin attr "{begin}"')
- if annotate:
- annotations[utt.utt_id] = utt_res
-
- return (results, allmatches, exact_results, levels, annotations or None)
-
-
-def run_post_queries(allresults: SastaResults,
- queries: List[QueryWithFunction]) -> None:
- post_queries: List[QueryWithFunction] = [
- q for q in queries if q.query.process == post_process]
- flat_queries: Dict[str, Query] = {q.id: q.query for q in queries}
-
- for q in post_queries:
- try:
- result = q.function(allresults, flat_queries)
- if result is not None:
- allresults.postresults[q.id] = result
- except Exception as e:
- # logger.warning(f'Failed to execute {q.function}')
- logger.exception(e)
+def annotate_transcript(transcript: Transcript, method: AssessmentMethod, ignore_existing: bool = False) -> AllResults:
+ if transcript.latest_run and not ignore_existing:
+        # run sastacore with the pre-existing SAF file
+ allresults, _samplesize = run_sastacore(transcript, method, True)
+ else:
+ # run sastacore normally
+ allresults, _samplesize = run_sastacore(transcript, method, False)
+ return allresults
diff --git a/backend/analysis/query/xlsx_output.py b/backend/analysis/query/xlsx_output.py
deleted file mode 100644
index b14d5792..00000000
--- a/backend/analysis/query/xlsx_output.py
+++ /dev/null
@@ -1,248 +0,0 @@
-import traceback
-from collections import Counter
-from typing import List, Tuple
-from analysis.annotations.constants import SAF_COMMENT_LEVEL, SAF_UNALIGNED_LEVEL
-
-from analysis.models import AssessmentMethod
-from analysis.query.functions import QueryWithFunction
-from analysis.results.results import AllResults
-from openpyxl import Workbook
-from openpyxl.styles import Font, PatternFill
-from openpyxl.styles.protection import Protection
-from openpyxl.utils import get_column_letter
-from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder
-
-ROMAN_NUMS = [None, 'I', 'II', 'III',
- 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X']
-
-BEFORE_WORDS_HEADERS = ['ID', 'Level', SAF_UNALIGNED_LEVEL]
-AFTER_WORDS_HEADERS = ['Dummy', 'Fases', SAF_COMMENT_LEVEL]
-
-
-def querycounts_to_xlsx(allresults: AllResults, queries: List[QueryWithFunction]):
- all_data = dict(allresults.coreresults, **allresults.postresults)
-
- wb = Workbook()
- worksheet = wb.active
-
- # header
- worksheet.append(['Query', 'Item', 'Fase', 'Utterance', 'Matches'])
- header = worksheet["1:1"]
- for cell in header:
- cell.font = Font(bold=True)
-
- query_mapping = {
- q.query.id: (q.query.fase or 0, q.query.item)
- for q in queries
- if q.query.id in all_data
- }
- sorted_queries = sorted(
- sorted(
- query_mapping.items(),
- key=lambda item: item[0]
- ),
- key=lambda item: item[1][0]
- )
-
- for qid, (fase, item) in sorted_queries:
- fase = fase if fase else 'nvt'
- data = all_data[qid]
-
- if isinstance(data, int):
- row = [qid, item, fase, 'total', data]
- worksheet.append(row)
- elif isinstance(data, Counter):
- first_row = [qid, item, fase, 'total', sum(data.values())]
- worksheet.append(first_row)
- for utt in sorted(data):
- if isinstance(utt, Tuple):
- row = [None, None, None, utt[-1], data[utt]]
- else:
- row = [None, None, None, utt, data[utt]]
- worksheet.append(row)
-
- worksheet.auto_filter.ref = worksheet.dimensions
-
- # column widths
- autosize_columns(worksheet)
-
- return wb
-
-
-def annotations_to_xlsx(allresults, method):
- try:
- wb = Workbook()
- worksheet = wb.active
-
- items = sorted(allresults.annotations.items())
- max_words = max([len(words) for (_, words) in items])
- headers = get_headers(max_words)
- worksheet.append(headers)
-
- zc_embeddings = method.category.zc_embeddings
-
- levels, lower_levels = get_levels(method)
-
- for utt_id, words in items:
- # Utt row, containing the word tokens
- words_row = [utt_id, 'Utt'] + [w.word for w in words]
-
- # a cell for each word, and one to record phases
- level_rows = make_levels_rows(max_words, levels, utt_id)
-
- if zc_embeddings:
- zc_rows = make_zc_rows(max_words, utt_id, words)
- else:
- zc_rows = None
-
- comment_rows = make_levels_rows(max_words, ['Commentaar'], utt_id)
-
- for word in words:
- process_word(zc_embeddings, lower_levels, level_rows, zc_rows, comment_rows, word.index, word)
-
- append_utterance_rows(
- worksheet,
- words_row,
- level_rows,
- zc_rows,
- comment_rows
- )
-
- format_worksheet(worksheet)
- autosize_columns(worksheet)
-
- return wb
-
- except Exception:
- traceback.print_exc()
-
-
-def process_word(zc_embeddings, lower_levels, level_rows, zc_rows, comment_rows, i_word, word) -> None:
- '''Iterate over word hits and fill the corresponding level'''
- for hit in word.hits:
- if zc_embeddings and hit['level'].lower() == 'zc':
- i_level = word.zc_embedding
- process_hit(zc_rows, i_word, hit, i_level)
- else:
- i_level = lower_levels.index(hit['level'].lower())
- process_hit(level_rows, i_word, hit, i_level)
- if word.comments:
- comment_rows[0][get_word_column(i_word)].add(word.comments)
-
-
-def process_hit(rows, i_word: int, hit, i_level: int) -> None:
- '''Add the hit to the right place in the rows, and append the fase as roman numeral'''
- rows[i_level][get_word_column(i_word)].add(hit['item'])
- try:
- rows[i_level][-1].append(
- ROMAN_NUMS[int(hit['fase'])])
- except Exception:
- pass
-
-
-def get_word_column(word_index: int) -> int:
-    # Subtract 1 because position 0 is unaligned, which is present in BEFORE_WORDS_HEADERS
- return word_index + len(BEFORE_WORDS_HEADERS) - 1
-
-
-def append_utterance_rows(worksheet, words_row, levels_rows, zc_rows, comment_rows) -> None:
- '''Append all rows for an utterance:
- words
- levels
- zc levels (optional)
- '''
- worksheet.append(words_row)
- append_level_rows(levels_rows, worksheet)
- append_level_rows(zc_rows, worksheet)
- append_level_rows(comment_rows, worksheet)
-
-
-def concat_cell(cell):
- if (isinstance(cell, set) or isinstance(cell, list)):
- try:
- return ','.join(sorted(cell)) or None
- except Exception:
- return None
- return cell
-
-
-def append_level_rows(rows, worksheet) -> None:
- '''Condense cells to comma separated strings and append them to worksheet'''
- if not rows:
- return
- for row in rows:
- row = [concat_cell(cell)
- for cell in row]
- worksheet.append(row)
-
-
-def make_levels_rows(max_words: int, levels: List[str], utt_id: int):
- level_rows = [
- [utt_id, level]
- + [set([])] # unaligned
- + [set([]) for _ in range(max_words + 1)]
- + [[]] # fases
- # Everything after fases is undefined so fases are easy to find with -1
- for level in levels
- ]
- return level_rows
-
-
-def make_zc_rows(max_words: int, utt_id: int, words):
- '''Rows for Zc levels. At least one, but more if deeper embeddings are present.
- '''
- embed_levels = {w.zc_embedding for w in words}
- max_embed = max(embed_levels)
- zc_levels = ['Zc'] * (max_embed + 1) # N + 1 Zc levels
- return make_levels_rows(max_words, zc_levels, utt_id)
-
-
-def get_headers(max_words: int) -> List[str]:
- word_headers = [f'Word{i}' for i in range(1, max_words + 1)]
- headers = BEFORE_WORDS_HEADERS + word_headers + AFTER_WORDS_HEADERS
-
- return headers
-
-
-def get_levels(method: AssessmentMethod) -> Tuple[List[str], List[str]]:
- '''Lowercased list of all levels (excluding ZC)'''
- levels = method.category.levels
- if method.category.zc_embeddings:
- levels = [lv for lv in levels if lv.lower() != 'Zc'.lower()]
- lower_levels = list(map(str.lower, levels))
- return levels, lower_levels
-
-
-def format_worksheet(worksheet) -> None:
- '''Locks all cells except annotation fields. Gives utterance rows a yellow background.'''
-
- # start by locking the entire sheet
- worksheet.protection.sheet = True
- unlocked = Protection(locked=False)
-
- header = worksheet["1:1"]
- for cell in header:
- # bold headers
- cell.font = Font(bold=True)
-
-    # yellow background for each utterance row
- for row in list(worksheet.rows)[1:]:
- if row[1].value == 'Utt':
- for cell in row:
- cell.fill = PatternFill(
- start_color="ffff00",
- end_color="ffff00",
- fill_type="solid")
- else:
- # unlock non-utterance rows
- # skip the first two columns (utt number and level)
- for cell in row[2:]:
- cell.protection = unlocked
-
-
-def autosize_columns(worksheet) -> None:
- dim_holder = DimensionHolder(worksheet=worksheet)
- for col in range(worksheet.min_column, worksheet.max_column + 1):
- dim_holder[get_column_letter(col)] = ColumnDimension(
- worksheet, min=col, max=col, auto_size=True)
- worksheet.column_dimensions = dim_holder
diff --git a/backend/analysis/score/__init__.py b/backend/analysis/score/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/analysis/score/testfiles/zc_embed_test.xml b/backend/analysis/score/testfiles/zc_embed_test.xml
deleted file mode 100644
index 6f4d9eae..00000000
--- a/backend/analysis/score/testfiles/zc_embed_test.xml
+++ /dev/null
@@ -1,28 +0,0 @@
- moet ook ergens een bakje waar dit in kan
\ No newline at end of file
diff --git a/backend/analysis/score/zc_embedding.py b/backend/analysis/score/zc_embedding.py
deleted file mode 100644
index 06a52f0a..00000000
--- a/backend/analysis/score/zc_embedding.py
+++ /dev/null
@@ -1,45 +0,0 @@
-
-from typing import Dict
-
-CLAUSALS = ['smain', 'rel', 'whrel', 'whsub', 'whq', 'sv1']
-
-
-def is_token(node):
- return any(x in ['pt', 'pos'] for x in node.keys())
-
-
-def has_cat(node):
- return 'cat' in node.keys()
-
-
-def is_clausal(node):
- return is_direct_clausal(node) or is_child_clausal(node)
-
-
-def is_direct_clausal(node):
- return node.attrib.get('cat') in CLAUSALS
-
-
-def is_child_clausal(node):
- return node.attrib.get('cat') in ['cp'] and \
- any(n.attrib.get('cat') in ['ssub', 'ti'] for n in list(node))
-
-
-def solve(node, embed, results):
- if is_token(node):
- results[node.attrib['begin']] = embed
- if has_cat(node) and is_clausal(node):
- embed += 1
- for child in node.getchildren():
- solve(child, embed, results)
- return results
-
-
-def get_zc_embeddings(syntree) -> Dict[str, int]:
- try:
- root = syntree.getroot()
- except Exception:
- root = syntree
- top_node = root.find('node')
- results = solve(top_node, 0, {})
- return results
diff --git a/backend/analysis/score/zc_embedding_test.py b/backend/analysis/score/zc_embedding_test.py
deleted file mode 100644
index 79071521..00000000
--- a/backend/analysis/score/zc_embedding_test.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os.path as op
-
-from lxml import etree as ET
-
-from .zc_embedding import get_zc_embeddings
-
-HERE = op.dirname(op.abspath(__file__))
-FILES = op.join(HERE, 'testfiles')
-
-
-def test_zc_embed():
- tree = ET.parse(op.join(FILES, 'zc_embed_test.xml'))
- expected_embeddings = [1, 1, 1, 1, 1, 2, 2, 2, 2]
- word_indices = [str(x) for x in range(0, 9)]
- expected = dict(zip(word_indices, expected_embeddings))
- assert expected == get_zc_embeddings(tree)
diff --git a/backend/analysis/views.py b/backend/analysis/views.py
index aaee2b0a..23a7b85c 100644
--- a/backend/analysis/views.py
+++ b/backend/analysis/views.py
@@ -5,10 +5,11 @@
import logging
from io import BytesIO, StringIO
-from analysis.annotations.enrich_chat import enrich_chat
-from analysis.annotations.safreader import SAFReader
-from analysis.query.run import query_transcript
-from analysis.query.xlsx_output import annotations_to_xlsx, querycounts_to_xlsx
+from analysis.query.run import annotate_transcript
+from annotations.reader import read_saf
+from annotations.writers.querycounts import querycounts_to_xlsx
+from annotations.writers.saf_chat import enrich_chat
+from annotations.writers.saf_xlsx import SAFWriter
from celery import group
from convert.chat_writer import ChatWriter
from django.db.models import Q
@@ -25,8 +26,9 @@
from .models import (AnalysisRun, AssessmentMethod, Corpus, MethodCategory,
Transcript, UploadFile)
from .permissions import IsCorpusChildOwner, IsCorpusOwner
-from .serializers import (AssessmentMethodSerializer, CorpusDetailsSerializer, CorpusListSerializer,
- MethodCategorySerializer, TranscriptDetailsSerializer,
+from .serializers import (AssessmentMethodSerializer, CorpusDetailsSerializer,
+ CorpusListSerializer, MethodCategorySerializer,
+ TranscriptDetailsSerializer,
TranscriptListSerializer, UploadFileSerializer)
from .utils import StreamFile
@@ -85,28 +87,29 @@ def query(self, request, *args, **kwargs):
content_type=SPREADSHEET_MIMETYPE)
response['Content-Disposition'] = "attachment; filename=matches_output.xlsx"
- allresults, queries_with_funcs = query_transcript(transcript, method)
+ allresults = annotate_transcript(transcript, method)
- spreadsheet = querycounts_to_xlsx(allresults, queries_with_funcs)
+ spreadsheet = querycounts_to_xlsx(allresults, method)
spreadsheet.save(response)
return response
@action(detail=True, methods=['POST'], name='Annotate')
def annotate(self, request, *args, **kwargs):
+ # Retrieve objects
transcript = self.get_object()
method_id = request.data.get('method')
-
method = AssessmentMethod.objects.get(pk=method_id)
- zc_embed = method.category.zc_embeddings
- allresults, queries_with_funcs = query_transcript(
- transcript, method, True, zc_embed
- )
+ # Perform the actual querying
+ allresults = annotate_transcript(transcript, method)
- spreadsheet = annotations_to_xlsx(allresults, method)
+ # Always create an XLSX file for AnalysisRun purposes
+ writer = SAFWriter(method.to_sastadev(), allresults)
+ spreadsheet = writer.workbook
self.create_analysis_run(transcript, method, spreadsheet)
+ # Adapt output to requested format
format = request.data.get('format', 'xlsx')
if format == 'xlsx':
@@ -161,33 +164,33 @@ def upload_annotations(self, request, *args, **kwargs):
new_run = self.create_analysis_run(obj, latest_run.method, file, is_manual=True)
try:
- reader = SAFReader(new_run.annotation_file.path, latest_run.method, obj)
+ read_saf(new_run.annotation_file.path,
+ latest_run.method.to_sastadev())
except Exception as e:
new_run.delete()
logger.exception(e)
return Response(str(e), status.HTTP_400_BAD_REQUEST)
- if reader.errors:
- new_run.delete()
- return Response(reader.formatted_errors(), status.HTTP_400_BAD_REQUEST)
+ # TODO: re-enable proper error logging for reading SAF files
+ # if reader.errors:
+ # new_run.delete()
+ # return Response(reader.formatted_errors(), status.HTTP_400_BAD_REQUEST)
return Response('Success', status.HTTP_200_OK)
@action(detail=True, methods=['POST'], name='Generate form')
def generateform(self, request, *args, **kwargs):
+ # Retrieve objects
transcript = self.get_object()
method_id = request.data.get('method')
method = AssessmentMethod.objects.get(pk=method_id)
- zc_embed = method.category.zc_embeddings
# Find the form function for this method
form_func = method.category.get_form_function()
if not form_func:
raise ParseError(detail='No form definition for this method.')
- allresults, _ = query_transcript(
- transcript, method, annotate=False, zc_embed=zc_embed,
- )
+ allresults = annotate_transcript(transcript, method)
form = form_func(allresults, None, in_memory=True)
diff --git a/backend/analysis/annotations/__init__.py b/backend/annotations/__init__.py
similarity index 100%
rename from backend/analysis/annotations/__init__.py
rename to backend/annotations/__init__.py
diff --git a/backend/annotations/apps.py b/backend/annotations/apps.py
new file mode 100644
index 00000000..ecaea75e
--- /dev/null
+++ b/backend/annotations/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class AnnotationsConfig(AppConfig):
+ default_auto_field = 'django.db.models.BigAutoField'
+ name = 'annotations'
diff --git a/backend/annotations/conftest.py b/backend/annotations/conftest.py
new file mode 100644
index 00000000..00eabfe2
--- /dev/null
+++ b/backend/annotations/conftest.py
@@ -0,0 +1,11 @@
+
+import pytest
+
+from annotations.writers.saf_xlsx import SAFWriter
+
+
+@pytest.fixture
+def safwriter(asta_method, single_utt_allresults):
+ sd_method = asta_method.to_sastadev()
+ writer = SAFWriter(sd_method, single_utt_allresults)
+ return writer
diff --git a/backend/annotations/constants.py b/backend/annotations/constants.py
new file mode 100644
index 00000000..7785c74f
--- /dev/null
+++ b/backend/annotations/constants.py
@@ -0,0 +1,32 @@
+from annotations.utils import preflabel
+from sastadev.SAFreader import (commentsheaders, levelheaders, literallevels,
+ speakerheaders, stagesheaders,
+ unalignedheaders, uttidheaders)
+
+SAF_COMMENT_LEVEL = preflabel(commentsheaders, str.capitalize)
+SAF_COMMENT_COLUMN = preflabel(commentsheaders, str.capitalize)
+SAF_COMMENT_HEADERS = list(map(str.lower, commentsheaders))
+
+SAF_UTT_HEADER = SAF_UTT_LEVEL = preflabel(uttidheaders, str.capitalize)
+SAF_UTT_LEVELS = list(map(str.lower, uttidheaders))
+
+SAF_UNALIGNED_LEVEL = preflabel(unalignedheaders, str.capitalize)
+SAF_UNALIGNED_LEVELS = list(map(str.lower, unalignedheaders))
+
+SAF_LEVEL_HEADER = preflabel(levelheaders, str.capitalize)
+SAF_LEVEL_HEADERS = list(map(str.lower, levelheaders))
+
+SAF_FASES_COLUMN = preflabel(stagesheaders, str.capitalize)
+SAF_FASES_HEADERS = list(map(str.lower, stagesheaders))
+
+SAF_SPEAKER_HEADER = preflabel(speakerheaders, str.capitalize)
+SAF_SPEAKER_COLUMNS = list(map(str.lower, speakerheaders))
+
+SAF_LITERAL_LEVELS = list(map(str.lower, literallevels))
+
+# Composed headers
+PRE_WORDS_HEADERS = [SAF_UTT_HEADER, SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL]
+POST_WORDS_HEADERS = [SAF_FASES_COLUMN, SAF_COMMENT_COLUMN]
+
+PRIMARY_COLOR = '3f51b5'
+SECONDARY_COLOR = 'b5a33f'
diff --git a/backend/annotations/reader.py b/backend/annotations/reader.py
new file mode 100644
index 00000000..03f11de2
--- /dev/null
+++ b/backend/annotations/reader.py
@@ -0,0 +1,20 @@
+from sastadev.allresults import AllResults
+from sastadev.methods import Method
+from sastadev.SAFreader import get_golddata, richscores2scores
+
+
+def read_saf(saf_filename: str, method: Method, includeimplies: bool = False) -> AllResults:
+ '''Wrapper around SASTADEV SAF reader'''
+ infilename = saf_filename
+ allutts, richexactscores = get_golddata(infilename, method, includeimplies)
+ exactresults = richscores2scores(richexactscores)
+ annotatedfileresults = AllResults(uttcount=len(allutts),
+ coreresults={},
+ exactresults=exactresults,
+ postresults={},
+ allmatches={},
+ filename=infilename,
+ analysedtrees=[],
+ allutts=allutts,
+ annotationinput=True)
+ return annotatedfileresults
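A hedged usage sketch of the wrapper (the file path is a placeholder; converting the Django method to its sastadev counterpart mirrors the call sites in `views.py`):

    from analysis.models import AssessmentMethod
    from annotations.reader import read_saf

    method = AssessmentMethod.objects.get(name='asta_test_method')  # hypothetical lookup
    results = read_saf('/path/to/annotations_saf.xlsx', method.to_sastadev())
    print(results.uttcount, len(results.exactresults))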
diff --git a/backend/annotations/utils.py b/backend/annotations/utils.py
new file mode 100644
index 00000000..fa3cd833
--- /dev/null
+++ b/backend/annotations/utils.py
@@ -0,0 +1,64 @@
+from typing import Any, List, Optional
+
+from annotations import constants
+from openpyxl.styles import Font, PatternFill
+from openpyxl.styles.protection import Protection
+from openpyxl.utils import get_column_letter
+from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder
+from sastadev.allresults import AllResults
+
+
+def preflabel(labels: List[str], casing: Optional[callable] = None) -> str:
+ try:
+ label = labels[0]
+ return casing(label) if casing else label
+ except IndexError:
+ return ''
+
+
+def get_max_words(allresults: AllResults) -> int:
+ '''Get the length of the longest utterance in the results'''
+ return max(len(v) for v in allresults.allutts.values())
+
+
+def ljust(li: List[Any], n: int, fillvalue=None) -> List[Any]:
+ '''Pad the list with fillvalues up to N'''
+ return li + [fillvalue] * (n - len(li))
+
+
+def format_worksheet(worksheet) -> None:
+ '''Locks all cells except annotation fields.
+ Gives utterance rows a primary background.
+ '''
+
+ # start by locking the entire sheet
+ worksheet.protection.sheet = True
+ unlocked = Protection(locked=False)
+
+ header = worksheet["1:1"]
+ for cell in header:
+ # bold headers
+ cell.font = Font(bold=True)
+
+    # primary-colored background for each utterance row
+ for row in list(worksheet.rows)[1:]:
+ if row[1].value == constants.SAF_UTT_LEVEL:
+ for cell in row:
+ cell.font = Font(color='FFFFFF')
+ cell.fill = PatternFill(
+ start_color=constants.PRIMARY_COLOR,
+ end_color=constants.PRIMARY_COLOR,
+ fill_type="solid")
+ else:
+ # unlock non-utterance rows
+ # skip the first two columns (utt number and level)
+ for cell in row[2:]:
+ cell.protection = unlocked
+
+
+def autosize_columns(worksheet) -> None:
+ dim_holder = DimensionHolder(worksheet=worksheet)
+ for col in range(worksheet.min_column, worksheet.max_column + 1):
+ dim_holder[get_column_letter(col)] = ColumnDimension(
+ worksheet, min=col, max=col, auto_size=True)
+ worksheet.column_dimensions = dim_holder
diff --git a/backend/annotations/writer_tests.py b/backend/annotations/writer_tests.py
new file mode 100644
index 00000000..eceda8a6
--- /dev/null
+++ b/backend/annotations/writer_tests.py
@@ -0,0 +1,49 @@
+from annotations.constants import (SAF_COMMENT_COLUMN, SAF_FASES_COLUMN,
+ SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL,
+ SAF_UTT_HEADER)
+from annotations.writers.saf_xlsx import SAFWriter
+
+from .utils import ljust
+
+
+def test_safwriter(safwriter: SAFWriter):
+ with open('/Users/a3248526/Documents/saf_writer_test.xlsx', 'wb') as f:
+ safwriter.write(f)
+ assert safwriter
+
+
+def test_headers(safwriter: SAFWriter):
+ found = safwriter._annotations_header_row()
+ expected = [SAF_UTT_HEADER, SAF_LEVEL_HEADER, SAF_UNALIGNED_LEVEL,
+ *[f'Word{n}' for n in range(1, 19)],
+ SAF_FASES_COLUMN, SAF_COMMENT_COLUMN]
+ assert found == expected
+
+
+def test_uttlevel_row(safwriter: SAFWriter):
+ id = 1
+ words = safwriter.results.allutts[id]
+ found = safwriter._uttlevel_row(id, words)
+ expected = [1, SAF_UTT_HEADER, None, 'ja', 'uh', 'ik', 'vind', 'het',
+ 'beetje', 'moeilijk', 'om', 'het', 'goed', 'te', 'vertellen',
+ 'want', 'ik', 'heb', 'een', 'ongeluk', 'gehad', None, None]
+ assert found == expected
+
+
+def test_ljust_list():
+ input = ['a', 'b', 'c']
+ ljustified = ljust(input, 5)
+ assert ljustified == input + [None, None]
+ ljustified = ljust(input, 3)
+ assert ljustified == input
+ ljustified = ljust(input, 2)
+ assert ljustified == input
+
+
+def test_uttlevel_offset(safwriter: SAFWriter):
+ assert safwriter._uttlevel_row_number(0, 'Samplegrootte') == 3
+ assert safwriter._uttlevel_row_number(0, 'Taalmaat') == 5
+ assert safwriter._uttlevel_row_number(0, 'Opmerkingen') == 8
+ assert safwriter._uttlevel_row_number(2, 'Samplegrootte') == 17
+ assert safwriter._uttlevel_row_number(2, 'Taalmaat') == 19
+ assert safwriter._uttlevel_row_number(2, 'Opmerkingen') == 22
diff --git a/backend/annotations/writers/querycounts.py b/backend/annotations/writers/querycounts.py
new file mode 100644
index 00000000..4a546980
--- /dev/null
+++ b/backend/annotations/writers/querycounts.py
@@ -0,0 +1,68 @@
+from collections import Counter, defaultdict
+
+from analysis.models import AssessmentMethod
+from annotations.constants import SAF_FASES_COLUMN, SAF_UTT_HEADER
+from natsort import natsorted
+from openpyxl import Workbook
+from openpyxl.styles import Font
+from openpyxl.utils import get_column_letter
+from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder
+from sastadev.allresults import AllResults
+from sastadev.reduceresults import exact2results
+
+QUERYCOUNT_HEADERS = ['Query', 'Item',
+ SAF_FASES_COLUMN, SAF_UTT_HEADER, 'Matches']
+
+TOTAL_LABEL = 'totaal'
+NOT_APPLICABLE_LABEL = 'nvt'
+
+
+def querycounts_to_xlsx(allresults: AllResults, method: AssessmentMethod):
+ wb = Workbook()
+ worksheet = wb.active
+
+ # header
+ worksheet.append(QUERYCOUNT_HEADERS)
+ header = worksheet["1:1"]
+ for cell in header:
+ cell.font = Font(bold=True)
+
+ nonempty_queries = {k: v for k, v in allresults.exactresults.items() if v}
+
+ res = exact2results(nonempty_queries)
+
+ # need to reduce the results
+ # because of double results for lemma queries
+ reduced_results = defaultdict(Counter)
+ for (k, _), v in res.items():
+ reduced_results[k] += v
+
+ # write rows of data
+ for qid in natsorted(reduced_results):
+ # get query info
+ cntr = reduced_results[qid]
+ q = method.queries.get(query_id=qid)
+
+ # write the total row
+ total_row = [qid, q.item, q.fase or NOT_APPLICABLE_LABEL,
+ TOTAL_LABEL, sum(cntr.values())]
+ worksheet.append(total_row)
+
+ for utt_id in natsorted(cntr.keys()):
+ row = [None, None, None, utt_id, cntr[utt_id]]
+ worksheet.append(row)
+
+ worksheet.auto_filter.ref = worksheet.dimensions
+
+ # column widths
+ autosize_columns(worksheet)
+
+ return wb
+
+
+def autosize_columns(worksheet) -> None:
+ dim_holder = DimensionHolder(worksheet=worksheet)
+ for col in range(worksheet.min_column, worksheet.max_column + 1):
+ dim_holder[get_column_letter(col)] = ColumnDimension(
+ worksheet, min=col, max=col, auto_size=True)
+ worksheet.column_dimensions = dim_holder
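The reduction step above is what folds the per-lemma entries of a lemma query back into a single row; a small self-contained sketch with values borrowed from the conftest fixture later in this patch (the keyed shape `{(query_id, item): Counter}` is assumed to match what `exact2results` returns):

    from collections import Counter, defaultdict

    res = {('A051', 'beet'): Counter({'1': 1}),
           ('A051', 'vertellen'): Counter({'1': 1})}
    reduced = defaultdict(Counter)
    for (qid, _), counts in res.items():
        reduced[qid] += counts
    assert reduced['A051'] == Counter({'1': 2})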
diff --git a/backend/annotations/writers/saf_chat.py b/backend/annotations/writers/saf_chat.py
new file mode 100644
index 00000000..36793db7
--- /dev/null
+++ b/backend/annotations/writers/saf_chat.py
@@ -0,0 +1,66 @@
+from collections import defaultdict
+from typing import Dict, List
+
+from analysis.models import AssessmentMethod, Transcript
+from analysis.results.results import AllResults
+from chamd.chat_reader import ChatLine, ChatTier
+from convert.chat_reader import ChatDocument
+from natsort import natsorted
+from sastadev.sastatypes import ExactResultsDict
+
+
+def _items_by_utt_word(exactresults: ExactResultsDict, items_mapping: Dict) -> Dict:
+ # filter out empty
+ results = {k: v for k, v in exactresults.items() if v}
+
+ # create nested defaultdict: dictionary of dictionaries of lists
+ out = defaultdict(lambda: defaultdict(list))
+
+ for (qid, _), hits in results.items():
+ for (utt_id, wordno) in hits:
+ out[utt_id][wordno].append(items_mapping.get(qid))
+
+ return out
+
+
+def _find_doc_line(lines: List[ChatLine], uttno: int) -> ChatLine:
+ # TODO: more efficient way to do this?
+ return next((x for x in lines if x.uttid == uttno), None)
+
+
+def enrich_chat(transcript: Transcript,
+ allresults: AllResults,
+ method: AssessmentMethod) -> ChatDocument:
+ doc = ChatDocument.from_chatfile(
+ transcript.content.path, transcript.corpus.method_category)
+
+ # construct a mapping of uttno to uttid
+ # because uttid is unknown to CHAT
+ marked_utts = (x for x in transcript.utterances.all() if x.for_analysis)
+ id_no_mapping = {
+ u.utt_id: u.uttno for u in marked_utts
+ }
+
+ # create mapping of query_ids to items
+ items_mapping = {q.query_id: q.item for q in method.queries.all()}
+
+ results_by_word = _items_by_utt_word(
+ allresults.exactresults, items_mapping)
+
+ for utt_id, words in results_by_word.items():
+ uttno = id_no_mapping.get(int(utt_id))
+ doc_line = _find_doc_line(doc.lines, uttno)
+
+ utt_hits = []
+ for w in natsorted(words.keys()):
+ utt_hits.extend(words[w])
+
+ annotation_str = ', '.join(utt_hits)
+ doc_line.tiers['xsyn'] = ChatTier(id='xsyn', text=annotation_str)
+ # id_headers = [h for h in doc.headers if h.line.startswith('@ID')]
+ # last_id_header = max(id_headers, key=attrgetter('linestartno'))
+ # doc.headers.append(ChatHeader(
+ # line=f'@Comment:\tAnnotations on %xsyn-tiers generated by SASTA, using {method.category.name}',
+ # linestartno=last_id_header.linestartno+1))
+
+ return doc
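In short: for every analysed utterance the matched items are gathered per word position, joined into a comma-separated string, and attached as an `%xsyn` tier on the corresponding CHAT line. A hedged sketch of the call, with the objects assumed to exist:

    doc = enrich_chat(transcript, allresults, method)  # ChatDocument with %xsyn tiers added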
diff --git a/backend/annotations/writers/saf_xlsx.py b/backend/annotations/writers/saf_xlsx.py
new file mode 100644
index 00000000..7dbf1010
--- /dev/null
+++ b/backend/annotations/writers/saf_xlsx.py
@@ -0,0 +1,171 @@
+import itertools
+from dataclasses import dataclass, field
+from io import BytesIO
+from typing import Dict, List, Tuple
+
+from analysis.models import MethodCategory
+from openpyxl import Workbook
+from openpyxl.cell.cell import Cell
+from openpyxl.worksheet.worksheet import Worksheet
+from sastadev.allresults import AllResults, ResultsKey
+from sastadev.methods import Method
+from sastadev.sastatypes import ExactResults
+from annotations.constants import (POST_WORDS_HEADERS, PRE_WORDS_HEADERS,
+ SAF_COMMENT_LEVEL, SAF_UTT_LEVEL)
+from annotations.utils import autosize_columns, format_worksheet, get_max_words, ljust
+from natsort import natsorted
+
+
+@dataclass
+class SAFWriter():
+ method: Method
+ results: AllResults
+ workbook: Workbook = field(init=False, default_factory=Workbook)
+ anno_ws: Worksheet = field(init=False)
+ method_category: MethodCategory = field(init=False)
+ # Number of non-word columns, counted from 0
+ word_offset: int = field(default=len(PRE_WORDS_HEADERS), init=False)
+ # Number of words of the longest utterance in the results
+ max_words: int = field(init=False)
+ # Offset for row (1 - len(levels)) below utt row (0)
+ level_offsets: Dict[str, int] = field(init=False)
+ # Number of rows each utterance takes up (utt + level rows)
+ utt_n_rows: int = field(init=False)
+ # header row
+ anno_headers: List[str] = field(init=False)
+
+ def __post_init__(self) -> None:
+ self.max_words = get_max_words(self.results)
+ self.method_category = MethodCategory.objects.get(
+ name=self.method.name.upper())
+
+ all_levels = [SAF_UTT_LEVEL,
+ *self.method_category.levels,
+ SAF_COMMENT_LEVEL]
+ self.level_offsets = {
+ level.lower(): index
+ for (index, level)
+ in enumerate(all_levels)
+ }
+ self.utt_n_rows = (len(all_levels))
+ self.anno_headers = self._annotations_header_row()
+ self.make_workbook()
+
+ def write(self, target: BytesIO) -> None:
+ '''Write the completed output file'''
+ self.workbook.save(target)
+
+ def make_workbook(self) -> None:
+ '''Create the complete workbook.
+        Any additional required sheets should be created in this method.
+ '''
+ _ = self._make_annotations_worksheet()
+ format_worksheet(self.anno_ws)
+ autosize_columns(self.anno_ws)
+ errors_ws = self.workbook.create_sheet('error', 1)
+ errors_ws.cell(1, 1).value = 'Hier komen errors'
+
+ def _make_annotations_worksheet(self) -> Worksheet:
+ '''Transform results into a SAF-formatted worksheet'''
+ self.anno_ws = self.workbook.create_sheet(title='annotations', index=0)
+ # Headers
+ self.anno_ws.append(self.anno_headers)
+
+ # Rest
+ self._make_levels_rows(self.anno_ws)
+
+ # Fill with values
+ for qid, qresults in self.results.exactresults.items():
+ self._fill_query(qid, qresults)
+ return self.anno_ws
+
+ def _annotations_header_row(self) -> List[str]:
+ '''Create header row with correct number of word columns'''
+ word_headers = [f'Word{i}' for i in range(1, self.max_words + 1)]
+
+ return list(itertools.chain(
+ PRE_WORDS_HEADERS,
+ word_headers,
+ POST_WORDS_HEADERS
+ ))
+
+ def _uttlevel_row(self, id: int, words: List[str]) -> List[str]:
+ '''Create utterance level row'''
+ pre_word_values = [id, SAF_UTT_LEVEL, None] # Unaligned
+ word_values = ljust(words, self.max_words)
+ post_word_values = [None, None] # Fases, Commentaar
+ return list(itertools.chain(
+ pre_word_values,
+ word_values,
+ post_word_values
+ ))
+
+ def _make_levels_rows(self, ws: Worksheet) -> None:
+ '''Create rows for all utterances, all levels'''
+ row_size = len(self.anno_headers)
+ all_levels = self.method_category.levels + [SAF_COMMENT_LEVEL]
+
+ for utt_id, words in natsorted(self.results.allutts.items(),
+ key=lambda x: x[0]):
+ ws.append(self._uttlevel_row(utt_id, words))
+ for level in all_levels:
+ level_row = ljust([utt_id, level], row_size)
+ ws.append(level_row)
+
+ def _fill_query(self, query_id: ResultsKey, exact_results: ExactResults):
+ '''Find and fill all cells for a single query'''
+ lemma_item = None
+ if isinstance(query_id, Tuple) and not query_id[0] == query_id[1]:
+ # Lemma queries hold the lemma in second position
+ lemma_item = query_id[1]
+
+ simple_query_id = query_id[0]
+ query = self.method.queries.get(simple_query_id)
+ item = lemma_item or query.item
+ fase = query.fase
+
+ for utt_id, word_nr in exact_results:
+ # We cannot assume that utterances are numbered 1-N sequentially
+ try:
+ utt_nr = list(self.results.allutts.keys()).index(utt_id)
+ except ValueError:
+ utt_nr = list(self.results.allutts.keys()).index(int(utt_id))
+ row, col = self._cell_location(utt_nr, query.level, word_nr)
+ cell = self.anno_ws.cell(row, col)
+ self._append_item(cell, item)
+ if fase:
+ self._append_fase(row, str(fase))
+
+ def _cell_location(self, utt_nr: int, level: str,
+ word_nr: int) -> Tuple[int, int]:
+ '''Find the coordinates of a cell'''
+ return (
+ self._uttlevel_row_number(utt_nr, level),
+ self._word_col_number(word_nr)
+ )
+
+ def _uttlevel_row_number(self, utt_nr: int, level: str) -> int:
+ '''Calculate the row number for level of utterance (1 indexed)'''
+ total = 1 # header and 1-indexed offsets
+ utt_offset = (utt_nr * self.utt_n_rows) + 1
+ level_offset = self.level_offsets.get(level.lower(), 0)
+ total += utt_offset + level_offset
+ return total
+
+ def _word_col_number(self, word_nr: int) -> int:
+ '''Calculate the column number for a word'''
+ return word_nr + len(PRE_WORDS_HEADERS)
+
+ def _append_item(self, cell: Cell, item: str) -> None:
+ cell.value = item if not cell.value else f'{cell.value}, {item}'
+
+ def _append_fase(self, row: int, fase: str) -> None:
+ fase_cell = self.anno_ws.cell(row, len(self.anno_headers) - 1)
+ sep = ', '
+ if not fase_cell.value:
+ fase_cell.value = fase
+ else:
+ current = set(fase_cell.value.split(sep))
+ current.add(fase)
+ new = sep.join(sorted(list(current)))
+ fase_cell.value = new
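The cell addressing follows from the sheet layout: one header row, then `utt_n_rows` rows per utterance (the Utt row, one row per method level, and the comment row). A worked sketch using the ASTA fixture from the writer tests, where the five ASTA levels plus the Utt and comment rows give `utt_n_rows == 7` (the level offsets below are inferred from that fixture and are assumptions, not library constants):

    # assumed offsets: utt=0, samplegrootte=1, mlu=2, taalmaat=3,
    #                  foutenanalyse=4, lemma=5, opmerkingen=6
    def uttlevel_row_number(utt_nr: int, level_offset: int, utt_n_rows: int = 7) -> int:
        return 1 + (utt_nr * utt_n_rows + 1) + level_offset

    assert uttlevel_row_number(0, 1) == 3    # Samplegrootte of the first utterance
    assert uttlevel_row_number(0, 3) == 5    # Taalmaat of the first utterance
    assert uttlevel_row_number(2, 6) == 22   # comment row of the third utterance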
diff --git a/backend/anonymization.json b/backend/anonymization.json
deleted file mode 100644
index 60edeec1..00000000
--- a/backend/anonymization.json
+++ /dev/null
@@ -1,49 +0,0 @@
-[
- {
- "category": "place",
- "codes": ["PLAATS", "PLAATSNAAM", "WOONPLAATS"],
- "common": ["Utrecht", "Breda", "Leiden", "Maastricht", "Arnhem"]
- },
- {
- "category": "lastname",
- "codes": ["ACHTERNAAM"],
- "common": ["Jansen", "Hendriks", "Dekker", "Dijkstra", "Veenstra"]
- },
- {
- "category": "person",
- "codes": ["NAAM", "BROER", "ZUS", "KIND", "VADER", "MOEDER"],
- "common": ["Maria", "Jan", "Anna", "Esther", "Pieter", "Sam"]
- },
- {
- "category": "profession",
- "codes": ["BEROEP"],
- "common": ["timmerman", "chirurgh", "leraar", "ober", "verslaggever"]
- },
- {
- "category": "country",
- "codes": ["LAND"],
- "common": ["Duitsland", "Nederland", "Japan", "Kameroen", "India"]
- },
- {
- "category": "education",
- "codes": ["STUDIE", "OPLEIDING"],
- "common": [
- "bedrijfskunde",
- "informatica",
- "filosofie",
- "rechtsgeleerdheid",
- "werktuigbouwkunde"
- ]
- },
- {
- "category": "institution",
- "codes": ["ZORGINSTELLING", "INSTELLING", "ZIEKENHUIS"],
- "common": [
- "Diakonessenhuis",
- "Rijnstate",
- "Vogellanden",
- "HagaZiekenhuis",
- "Slingeland"
- ]
- }
-]
diff --git a/backend/conftest.py b/backend/conftest.py
new file mode 100644
index 00000000..66f4992c
--- /dev/null
+++ b/backend/conftest.py
@@ -0,0 +1,115 @@
+import glob
+from collections import Counter
+from os import path as op
+
+import pytest
+from analysis.models import AssessmentMethod, MethodCategory
+from django.conf import settings
+from django.core.files import File
+from sastadev.allresults import AllResults
+from sastadev.conf import settings as sd_settings
+
+from lxml import etree
+
+
+@pytest.fixture
+def cha_testfiles_dir():
+ return op.join(settings.BASE_DIR, 'test_files')
+
+
+@pytest.fixture
+def tarsp_category(db):
+ obj = MethodCategory.objects.create(
+ name='TARSP', zc_embeddings=True,
+ levels=['Sz', 'Zc', 'Wg', 'VVW'],
+ marking_postcodes=['[+ G]'])
+ yield obj
+ obj.delete()
+
+
+@pytest.fixture
+def stap_category(db):
+ obj = MethodCategory.objects.create(
+ name='STAP', zc_embeddings=False,
+ levels=['Complexiteit', 'Grammaticale fout'],
+ marking_postcodes=['[+ G]', '[+ VU]'])
+ yield obj
+ obj.delete()
+
+
+@pytest.fixture
+def asta_category(db):
+ obj = MethodCategory.objects.create(
+ name='ASTA', zc_embeddings=False, levels=[
+ "Samplegrootte",
+ "MLU",
+ "Taalmaat",
+ "Foutenanalyse",
+ "Lemma"
+ ], marking_postcodes=["[+ G]"])
+ yield obj
+ obj.delete()
+
+
+@pytest.fixture
+def method_dir():
+ return op.join(sd_settings.SD_DIR, 'data', 'methods')
+
+
+@pytest.fixture
+def tarsp_method(db, tarsp_category, method_dir):
+ file = glob.glob(f'{method_dir}/TARSP Index Current.xlsx')[0]
+ with open(file, 'rb') as f:
+ wrapped_file = File(f)
+ instance = AssessmentMethod(
+ name='tarsp_test_method', category=tarsp_category)
+ instance.content.save(op.basename(file), wrapped_file)
+ yield instance
+ instance.delete()
+
+
+@pytest.fixture
+def asta_method(db, asta_category, method_dir):
+ file = glob.glob(f'{method_dir}/ASTA_Index_Current.xlsx')[0]
+ with open(file, 'rb') as f:
+ wrapped_file = File(f)
+ instance = AssessmentMethod(
+ name='asta_test_method', category=asta_category)
+ instance.content.save(op.basename(file), wrapped_file)
+ yield instance
+ instance.delete()
+
+
+@pytest.fixture
+def single_utt_allresults(cha_testfiles_dir):
+ parsed = etree.parse(
+ op.join(cha_testfiles_dir, 'single_utt_corrected.xml'))
+ utts = parsed.xpath('alpino_ds')
+
+ return AllResults(
+ uttcount=2,
+ coreresults={('A029', 'A029'): Counter({'1': 1}), ('A045', 'A045'): Counter({'1': 1}),
+ ('A001', 'A001'): Counter({'1': 1}), ('A003', 'A003'): Counter({'1': 2}),
+ ('A013', 'A013'): Counter({'1': 1}), ('A018', 'A018'): Counter({'1': 2}),
+ ('A021', 'A021'): Counter({'1': 2}), ('A024', 'A024'): Counter({'1': 2}),
+ ('A051', 'beet'): Counter({'1': 1}), ('A051', 'vertellen'): Counter({'1': 1}),
+ ('A051', 'ongeluk'): Counter({'1': 1}), ('A051', 'hebben'): Counter({'1': 1})},
+
+ exactresults={('A029', 'A029'): [('1', 1)], ('A045', 'A045'): [('1', 2)],
+ ('A001', 'A001'): [('1', 7)], ('A003', 'A003'): [('1', 8), ('1', 13)],
+ ('A013', 'A013'): [('1', 4)], ('A018', 'A018'): [('1', 12), ('1', 18)],
+ ('A021', 'A021'): [('1', 6), ('1', 17)], ('A024', 'A024'): [('1', 4), ('1', 15)],
+ ('A051', 'beet'): [('1', 6)], ('A051', 'vertellen'): [('1', 12)],
+ ('A051', 'ongeluk'): [('1', 17)], ('A051', 'hebben'): [('1', 18)],
+ },
+ postresults={'A046': Counter({('beet', '1'): 1, ('ongeluk', '1'): 1}),
+ 'A049': Counter({('vertellen', '1'): 1, ('hebben', '1'): 1})},
+ allmatches=None, # Not provided in this fixture
+ filename='single_utt',
+ analysedtrees=[(n + 1, tree) for n, tree in enumerate(utts)],
+ annotationinput=True,
+ allutts={1: ['ja', 'uh', 'ik', 'vind', 'het', 'beetje', 'moeilijk',
+ 'om', 'het', 'goed', 'te', 'vertellen', 'want', 'ik',
+ 'heb', 'een', 'ongeluk', 'gehad']}
+
+ )
diff --git a/backend/convert/chat_reader_test.py b/backend/convert/chat_reader_test.py
index d1f9c851..36a8fbda 100644
--- a/backend/convert/chat_reader_test.py
+++ b/backend/convert/chat_reader_test.py
@@ -1,14 +1,12 @@
import os.path as op
import pytest
-from analysis.conftest import stap_category, tarsp_category
from .chat_reader import ChatDocument
from .conftest import TEST_DIR
@pytest.mark.django_db
-@pytest.mark.usefixtures("tarsp_category")
def test_chat_reader(chafiles, tarsp_category):
for (input, _) in chafiles:
doc = ChatDocument.from_chatfile(input, tarsp_category)
@@ -16,7 +14,6 @@ def test_chat_reader(chafiles, tarsp_category):
@pytest.mark.django_db
-@pytest.mark.usefixtures("tarsp_category", "stap_category")
def test_marking_postcodes(chafiles, tarsp_category, stap_category):
inf = op.join(TEST_DIR, 'TD16.cha')
diff --git a/backend/convert/chat_writer_test.py b/backend/convert/chat_writer_test.py
index 997453a0..9c7532e2 100644
--- a/backend/convert/chat_writer_test.py
+++ b/backend/convert/chat_writer_test.py
@@ -1,12 +1,10 @@
import pytest
-from analysis.conftest import tarsp_category
from .chat_reader import ChatDocument
from .chat_writer import ChatWriter
@pytest.mark.django_db
-@pytest.mark.usefixtures("tarsp_category")
def test_chat_writer(chafiles, tarsp_category):
for inpath, outpath in chafiles:
doc = ChatDocument.from_chatfile(inpath, tarsp_category)
diff --git a/backend/requirements.in b/backend/requirements.in
index f54d7d0c..70612e33 100644
--- a/backend/requirements.in
+++ b/backend/requirements.in
@@ -8,6 +8,7 @@ django-livereload-server
django-rest-auth[with_social]
django-revproxy>=0.9.16
lxml==4.9.1
+natsort
numpy<1.22
pandas==1.3.*
psycopg2
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 0ceeccea..77541270 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -122,6 +122,8 @@ murmurhash==1.0.7
# preshed
# spacy
# thinc
+natsort==8.4.0
+ # via -r requirements.in
numpy==1.21.6
# via
# -r requirements.in
@@ -210,7 +212,7 @@ requests==2.28.1
# spacy
requests-oauthlib==1.3.1
# via django-allauth
-sastadev==0.1.5
+sastadev==0.2.0
# via
# -r requirements.in
# auchann
diff --git a/backend/sasta/common_settings.py b/backend/sasta/common_settings.py
index bc42acbc..134b191a 100644
--- a/backend/sasta/common_settings.py
+++ b/backend/sasta/common_settings.py
@@ -25,7 +25,7 @@
'authentication',
'parse',
'convert',
- 'sastadev'
+ 'annotations'
]
MIDDLEWARE = [
diff --git a/backend/setup.cfg b/backend/setup.cfg
index 6b8a7c62..f6e96ba2 100644
--- a/backend/setup.cfg
+++ b/backend/setup.cfg
@@ -11,7 +11,7 @@ ignore_missing_imports = True
exclude =
sastadev
.env
-ignore = E501, W503
+ignore = E501, W503
max-complexity = 10
# pytest fixtures need this funky import style, hence the ignore F401 and F811
per-file-ignores =
diff --git a/backend/test_files/single_utt.cha b/backend/test_files/single_utt.cha
new file mode 100644
index 00000000..774e30c9
--- /dev/null
+++ b/backend/test_files/single_utt.cha
@@ -0,0 +1,11 @@
+@UTF8
+@Begin
+@Languages: nld
+@Participants: PMA pma Other, INV inv Other
+@ID: nld||PMA|||||Other|||
+@ID: nld||INV|||||Other|||
+@Comment: ##META text samplenaam = ASTA-16
+
+*INV: Kun u mij eens wat vertellen waarom u hier bent wat is er gebeurd
+*PMA: ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad
+%xsid: 1
diff --git a/backend/test_files/single_utt.xml b/backend/test_files/single_utt.xml
new file mode 100644
index 00000000..5c98c69b
--- /dev/null
+++ b/backend/test_files/single_utt.xml
@@ -0,0 +1,133 @@
+ Kun u mij eens wat vertellen waarom u hier bent wat is er gebeurd
+ ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad
\ No newline at end of file
diff --git a/backend/test_files/single_utt_corrected.xml b/backend/test_files/single_utt_corrected.xml
new file mode 100644
index 00000000..a10d7eb2
--- /dev/null
+++ b/backend/test_files/single_utt_corrected.xml
@@ -0,0 +1,129 @@
+ Kun u mij eens wat vertellen waarom u hier bent wat is er gebeurd
+ ja uh ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad
+ Q#ng1704292912|ik vind het beetje moeilijk om het goed te vertellen want ik heb een ongeluk gehad|1|1|-12.662841177789993
\ No newline at end of file
diff --git a/backend/test_files/single_utt_saf.xlsx b/backend/test_files/single_utt_saf.xlsx
new file mode 100644
index 00000000..9046b16b
Binary files /dev/null and b/backend/test_files/single_utt_saf.xlsx differ
diff --git a/frontend/src/app/transcript/transcript.component.ts b/frontend/src/app/transcript/transcript.component.ts
index 68e93be2..cd1dcceb 100644
--- a/frontend/src/app/transcript/transcript.component.ts
+++ b/frontend/src/app/transcript/transcript.component.ts
@@ -23,6 +23,8 @@ import {
TranscriptService,
} from '@services';
+import _ from 'lodash';
+
const XLSX_MIME =
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
const TXT_MIME = 'text/plain';
@@ -72,15 +74,19 @@ export class TranscriptComponent implements OnInit, OnDestroy {
);
}
+ hasLatestRun(): boolean {
+ return !_.isNil(this.transcript.latest_run);
+ }
+
allowCorrectionUpload(): boolean {
return (
this.transcript.status === TranscriptStatus.PARSED &&
- this.transcript.latest_run !== undefined
+ this.hasLatestRun()
);
}
allowCorrectionReset(): boolean {
- return this.transcript.latest_run !== undefined;
+ return this.hasLatestRun();
}
allowScoring(): boolean {
diff --git a/frontend/src/environments/version.ts b/frontend/src/environments/version.ts
index a69c88f7..40fc290e 100644
--- a/frontend/src/environments/version.ts
+++ b/frontend/src/environments/version.ts
@@ -1,2 +1,2 @@
// TODO: Compile this from toplevel package.json
-export const version = '0.8.2';
+export const version = '0.9.0';
diff --git a/package.json b/package.json
index 74886011..cfafca77 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "sasta",
- "version": "0.8.2",
+ "version": "0.9.0",
"description": "Annotate and analyze transcripts",
"author": "UU Digital Humanities Lab",
"license": "BSD-3-Clause",