From dc0b7925b514367479e30e859a941f022d72ae37 Mon Sep 17 00:00:00 2001
From: bjascob
Date: Sat, 27 Nov 2021 10:36:30 -0700
Subject: [PATCH] update tests for parse models

---
 .../02_Build_Aligment_Test_Corpus.py          | 49 +++++++++++++++
 tests/auto/ModelParseSPRING.py                | 63 +++++++++++++++++++
 tests/auto/ModelParseT5v1.py                  | 63 +++++++++++++++++++
 tests/auto/ModelParseT5v2.py                  | 63 +++++++++++++++++++
 .../Test_RBW_Alignment_String_Generation.py   |  5 +-
 5 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100755 scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
 create mode 100644 tests/auto/ModelParseSPRING.py
 create mode 100644 tests/auto/ModelParseT5v1.py
 create mode 100644 tests/auto/ModelParseT5v2.py

diff --git a/scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py b/scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
new file mode 100755
index 0000000..8075bf3
--- /dev/null
+++ b/scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
+import os
+import penman
+from amrlib.graph_processing.amr_loading_raw import load_raw_amr
+from amrlib.alignments.penman_utils import test_for_decode_encode_issue, strip_surface_alignments
+
+
+# Get rid of unneeded metadata and rename "alignments" to "isi_alignments"
+def mod_graph_meta(graph):
+    id = graph.metadata['id']
+    tok = graph.metadata['tok']
+    aligns = graph.metadata['alignments']
+    graph.metadata = {'id':id, 'tok':tok, 'isi_alignments':aligns}
+    return graph
+
+
+# Build a corpus of test cases for alignments
+if __name__ == '__main__':
+    corp_dir = 'amrlib/data/amr_annotation_3.0/data/alignments/split/test'
+    graph_fn = 'amrlib/data/alignments/test_w_surface.txt'
+    graph_ns_fn = 'amrlib/data/alignments/test_no_surface.txt'
+
+    os.makedirs(os.path.dirname(graph_fn), exist_ok=True)
+
+    # Loop through the files and load all entries
+    entries = []
+    print('Loading data from', corp_dir)
+    fpaths = [os.path.join(corp_dir, fn) for fn in os.listdir(corp_dir)]
+    for fpath in fpaths:
+        entries += load_raw_amr(fpath)
+    print('Loaded {:,} entries'.format(len(entries)))
+
+    # Check for the penman decode/re-encode issue and strip some metadata
+    good_graphs = []
+    good_graphs_ns = []
+    for entry in entries:
+        # Create a version with No Surface alignments
+        entry_ns = strip_surface_alignments(entry)
+        graph, is_good = test_for_decode_encode_issue(entry)
+        graph_ns, is_good_ns = test_for_decode_encode_issue(entry_ns)
+        if is_good and is_good_ns:
+            good_graphs.append( mod_graph_meta(graph) )
+            good_graphs_ns.append( mod_graph_meta(graph_ns) )
+
+    # Save the collated data
+    print('Saving {:,} good graphs to {:} and {:}'.format(len(good_graphs), graph_fn, graph_ns_fn))
+    penman.dump(good_graphs, graph_fn, indent=6)
+    penman.dump(good_graphs_ns, graph_ns_fn, indent=6)
diff --git a/tests/auto/ModelParseSPRING.py b/tests/auto/ModelParseSPRING.py
new file mode 100644
index 0000000..06acca7
--- /dev/null
+++ b/tests/auto/ModelParseSPRING.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import os
+import sys
+sys.path.insert(0, '../..')    # make '../..' first in the lib search path
+import logging
+import unittest
+import spacy
+import amrlib
+from amrlib.defaults import data_dir
+
+# Base classes and relative imports are proving to be problematic, so for now simply copy the code.
+
+# unittest creates a separate instance of the class for each test, so __init__ gets called a bunch of times.
+# However, they all seem to run in the same process, so globals are shared.
+# To avoid loading SpaCy multiple times, cache it in a global variable.
+# For the stog_model, amrlib caches it internally, and since there is only one process it will stay in memory
+# across all unit tests (even ones in other files when run with RunAllUnitTests.py) until explicitly
+# reloaded with amrlib.load_stog_model(model_dir).
+# When the SpaCy extensions are called, they check whether the global stog_model is None and
+# only call the loader if a model is not already loaded.
+SPRING_LOADED = None    # one-shot to ensure amrlib.stog_model is reloaded with this specific model
+SPACY_NLP = None
+class ModelParseSPRING(unittest.TestCase):
+    model_dir = os.path.join(data_dir, 'model_parse_spring-v0_1_0')
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        amrlib.setup_spacy_extension()
+        # Load/cache spacy
+        global SPACY_NLP
+        if SPACY_NLP is None:
+            SPACY_NLP = spacy.load('en_core_web_sm')
+        self.nlp = SPACY_NLP
+        # Load model in amrlib (amrlib will cache this itself)
+        global SPRING_LOADED
+        if SPRING_LOADED is None:
+            print('Loading', self.model_dir)
+            amrlib.load_stog_model(model_dir=self.model_dir)
+            SPRING_LOADED = True
+        self.stog = amrlib.stog_model
+
+    def testStoG(self):
+        graphs = self.stog.parse_sents(['This is a test of the system.'])
+        self.assertEqual(len(graphs), 1)
+
+    def testSpaCyDoc(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        graphs = doc._.to_amr()
+        self.assertEqual(len(graphs), 2)
+
+    def testSpaCySpan(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        span = list(doc.sents)[0]    # first sentence only
+        graphs = span._.to_amr()
+        self.assertEqual(len(graphs), 1)
+
+
+if __name__ == '__main__':
+    level = logging.WARNING
+    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
+    logging.basicConfig(level=level, format=format)
+
+    # run all methods that start with 'test'
+    unittest.main()
diff --git a/tests/auto/ModelParseT5v1.py b/tests/auto/ModelParseT5v1.py
new file mode 100644
index 0000000..bee18dd
--- /dev/null
+++ b/tests/auto/ModelParseT5v1.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import os
+import sys
+sys.path.insert(0, '../..')    # make '../..' first in the lib search path
+import logging
+import unittest
+import spacy
+import amrlib
+from amrlib.defaults import data_dir
+
+# Base classes and relative imports are proving to be problematic, so for now simply copy the code.
+
+# unittest creates a separate instance of the class for each test, so __init__ gets called a bunch of times.
+# However, they all seem to run in the same process, so globals are shared.
+# To avoid loading SpaCy multiple times, cache it in a global variable.
+# For the stog_model, amrlib caches it internally, and since there is only one process it will stay in memory
+# across all unit tests (even ones in other files when run with RunAllUnitTests.py) until explicitly
+# reloaded with amrlib.load_stog_model(model_dir).
+# When the SpaCy extensions are called, they check whether the global stog_model is None and
+# only call the loader if a model is not already loaded.
+T5V1_LOADED = None    # one-shot to ensure amrlib.stog_model is reloaded with this specific model
+SPACY_NLP = None
+class ModelParseT5v1(unittest.TestCase):
+    model_dir = os.path.join(data_dir, 'model_parse_t5-v0_1_0')
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        amrlib.setup_spacy_extension()
+        # Load/cache spacy
+        global SPACY_NLP
+        if SPACY_NLP is None:
+            SPACY_NLP = spacy.load('en_core_web_sm')
+        self.nlp = SPACY_NLP
+        # Load model in amrlib (amrlib will cache this itself)
+        global T5V1_LOADED
+        if T5V1_LOADED is None:
+            print('Loading', self.model_dir)
+            amrlib.load_stog_model(model_dir=self.model_dir)
+            T5V1_LOADED = True
+        self.stog = amrlib.stog_model
+
+    def testStoG(self):
+        graphs = self.stog.parse_sents(['This is a test of the system.'])
+        self.assertEqual(len(graphs), 1)
+
+    def testSpaCyDoc(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        graphs = doc._.to_amr()
+        self.assertEqual(len(graphs), 2)
+
+    def testSpaCySpan(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        span = list(doc.sents)[0]    # first sentence only
+        graphs = span._.to_amr()
+        self.assertEqual(len(graphs), 1)
+
+
+if __name__ == '__main__':
+    level = logging.WARNING
+    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
+    logging.basicConfig(level=level, format=format)
+
+    # run all methods that start with 'test'
+    unittest.main()
diff --git a/tests/auto/ModelParseT5v2.py b/tests/auto/ModelParseT5v2.py
new file mode 100644
index 0000000..bba3b01
--- /dev/null
+++ b/tests/auto/ModelParseT5v2.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import os
+import sys
+sys.path.insert(0, '../..')    # make '../..' first in the lib search path
+import logging
+import unittest
+import spacy
+import amrlib
+from amrlib.defaults import data_dir
+
+# Base classes and relative imports are proving to be problematic, so for now simply copy the code.
+
+# unittest creates a separate instance of the class for each test, so __init__ gets called a bunch of times.
+# However, they all seem to run in the same process, so globals are shared.
+# To avoid loading SpaCy multiple times, cache it in a global variable.
+# For the stog_model, amrlib caches it internally, and since there is only one process it will stay in memory
+# across all unit tests (even ones in other files when run with RunAllUnitTests.py) until explicitly
+# reloaded with amrlib.load_stog_model(model_dir).
+# When the SpaCy extensions are called, they check whether the global stog_model is None and
+# only call the loader if a model is not already loaded.
+T5V2_LOADED = None    # one-shot to ensure amrlib.stog_model is reloaded with this specific model
+SPACY_NLP = None
+class ModelParseT5v2(unittest.TestCase):
+    model_dir = os.path.join(data_dir, 'model_parse_t5-v0_2_0')
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        amrlib.setup_spacy_extension()
+        # Load/cache spacy
+        global SPACY_NLP
+        if SPACY_NLP is None:
+            SPACY_NLP = spacy.load('en_core_web_sm')
+        self.nlp = SPACY_NLP
+        # Load model in amrlib (amrlib will cache this itself)
+        global T5V2_LOADED
+        if T5V2_LOADED is None:
+            print('Loading', self.model_dir)
+            amrlib.load_stog_model(model_dir=self.model_dir)
+            T5V2_LOADED = True
+        self.stog = amrlib.stog_model
+
+    def testStoG(self):
+        graphs = self.stog.parse_sents(['This is a test of the system.'])
+        self.assertEqual(len(graphs), 1)
+
+    def testSpaCyDoc(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        graphs = doc._.to_amr()
+        self.assertEqual(len(graphs), 2)
+
+    def testSpaCySpan(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        span = list(doc.sents)[0]    # first sentence only
+        graphs = span._.to_amr()
+        self.assertEqual(len(graphs), 1)
+
+
+if __name__ == '__main__':
+    level = logging.WARNING
+    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
+    logging.basicConfig(level=level, format=format)
+
+    # run all methods that start with 'test'
+    unittest.main()
diff --git a/tests/manual/Test_RBW_Alignment_String_Generation.py b/tests/manual/Test_RBW_Alignment_String_Generation.py
index 368eed4..d563245 100755
--- a/tests/manual/Test_RBW_Alignment_String_Generation.py
+++ b/tests/manual/Test_RBW_Alignment_String_Generation.py
@@ -8,9 +8,12 @@ from amrlib.alignments.penman_utils import test_for_decode_encode_issue
+# 11/27/2021: This test is currently broken
+
+
 # Manual test to see if amrlib can generate the alignment string from surface alignments correctly,
 # using the LDC data as the baseline
-# !! Note that you must first create the test corpus. See the scripts directory for this
+# !! Note that you must first create the test corpus. See scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
 if __name__ == '__main__':
     fname = 'amrlib/data/alignments/test_w_surface.txt'
     entries = load_amr_entries(fname)
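
Reviewer note (not part of the patch): the three new test modules all exercise the same amrlib parse-model API. Below is a minimal standalone sketch of that usage pattern, assuming amrlib, spacy, and the en_core_web_sm model are installed and that a parse model such as model_parse_t5-v0_2_0 has been extracted into amrlib's data directory (the same location the tests above use); it mirrors testStoG and testSpaCyDoc rather than defining anything new.

    #!/usr/bin/python3
    # Hypothetical usage sketch of the API covered by the new unit tests
    import os
    import spacy
    import amrlib
    from amrlib.defaults import data_dir

    # Register doc._.to_amr() / span._.to_amr() and load one of the parse models tested above
    amrlib.setup_spacy_extension()
    amrlib.load_stog_model(model_dir=os.path.join(data_dir, 'model_parse_t5-v0_2_0'))

    # Direct sentence-to-graph parsing, as in testStoG
    graphs = amrlib.stog_model.parse_sents(['This is a test of the system.'])
    print(graphs[0])

    # SpaCy extension path, as in testSpaCyDoc / testSpaCySpan
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
    for graph in doc._.to_amr():    # one AMR graph string per sentence
        print(graph)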