From dc0b7925b514367479e30e859a941f022d72ae37 Mon Sep 17 00:00:00 2001
From: bjascob
Date: Sat, 27 Nov 2021 10:36:30 -0700
Subject: [PATCH] update tests for parse models

---
 .../02_Build_Aligment_Test_Corpus.py          | 49 +++++++++++++++
 tests/auto/ModelParseSPRING.py                | 63 +++++++++++++++++++
 tests/auto/ModelParseT5v1.py                  | 63 +++++++++++++++++++
 tests/auto/ModelParseT5v2.py                  | 63 +++++++++++++++++++
 .../Test_RBW_Alignment_String_Generation.py   |  5 +-
 5 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100755 scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
 create mode 100644 tests/auto/ModelParseSPRING.py
 create mode 100644 tests/auto/ModelParseT5v1.py
 create mode 100644 tests/auto/ModelParseT5v2.py

diff --git a/scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py b/scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
new file mode 100755
index 0000000..8075bf3
--- /dev/null
+++ b/scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
+import os
+import penman
+from amrlib.graph_processing.amr_loading_raw import load_raw_amr
+from amrlib.alignments.penman_utils import test_for_decode_encode_issue, strip_surface_alignments
+
+
+# Get rid of unneeded metadata and rename "alignments" to "isi_alignments"
+def mod_graph_meta(graph):
+    id = graph.metadata['id']
+    tok = graph.metadata['tok']
+    aligns = graph.metadata['alignments']
+    graph.metadata = {'id':id, 'tok':tok, 'isi_alignments':aligns}
+    return graph
+
+
+# Build a corpus of test cases for alignments
+if __name__ == '__main__':
+    corp_dir = 'amrlib/data/amr_annotation_3.0/data/alignments/split/test'
+    graph_fn = 'amrlib/data/alignments/test_w_surface.txt'
+    graph_ns_fn = 'amrlib/data/alignments/test_no_surface.txt'
+
+    os.makedirs(os.path.dirname(graph_fn), exist_ok=True)
+
+    # Loop through the files and load all entries
+    entries = []
+    print('Loading data from', corp_dir)
+    fpaths = [os.path.join(corp_dir, fn) for fn in os.listdir(corp_dir)]
+    for fpath in fpaths:
+        entries += load_raw_amr(fpath)
+    print('Loaded {:,} entries'.format(len(entries)))
+
+    # Check for the penman decode/re-encode issue and strip some metadata
+    good_graphs = []
+    good_graphs_ns = []
+    for entry in entries:
+        # Create a version with No Surface alignments
+        entry_ns = strip_surface_alignments(entry)
+        graph, is_good = test_for_decode_encode_issue(entry)
+        graph_ns, is_good_ns = test_for_decode_encode_issue(entry_ns)
+        if is_good and is_good_ns:
+            good_graphs.append( mod_graph_meta(graph) )
+            good_graphs_ns.append( mod_graph_meta(graph_ns) )
+
+    # Save the collated data
+    print('Saving {:,} good graphs to {:} and {:}'.format(len(good_graphs), graph_fn, graph_ns_fn))
+    penman.dump(good_graphs, graph_fn, indent=6)
+    penman.dump(good_graphs_ns, graph_ns_fn, indent=6)
diff --git a/tests/auto/ModelParseSPRING.py b/tests/auto/ModelParseSPRING.py
new file mode 100644
index 0000000..06acca7
--- /dev/null
+++ b/tests/auto/ModelParseSPRING.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import os
+import sys
+sys.path.insert(0, '../..')    # make '../..' first in the lib search path
+import logging
+import unittest
+import spacy
+import amrlib
+from amrlib.defaults import data_dir
+
+# Base classes and relative imports are proving to be problematic, so for now simply copy the code.
+
+# unittest creates a separate instance of the class for each test, so __init__ gets called a bunch of times.
+# However, they all seem to run in the same process, so globals are shared.
+# To avoid loading SpaCy multiple times, cache it in a global variable.
+# For the stog_model, amrlib caches it internally, and since there is only one process it will stay in memory
+# across all unit tests (even ones in other files when run with RunAllUnitTests.py) until explicitly
+# reloaded with amrlib.load_stog_model(model_dir).
+# When the SpaCy extensions are called, they check whether the global stog_model is None and
+# only call the loader if a model is not already loaded.
+SPRING_LOADED = None    # one-shot to ensure amrlib.stog_model is reloaded with this specific model
+SPACY_NLP = None
+class ModelParseSPRING(unittest.TestCase):
+    model_dir = os.path.join(data_dir, 'model_parse_spring-v0_1_0')
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        amrlib.setup_spacy_extension()
+        # Load/cache spacy
+        global SPACY_NLP
+        if SPACY_NLP is None:
+            SPACY_NLP = spacy.load('en_core_web_sm')
+        self.nlp = SPACY_NLP
+        # Load model in amrlib (amrlib will cache this itself)
+        global SPRING_LOADED
+        if SPRING_LOADED is None:
+            print('Loading', self.model_dir)
+            amrlib.load_stog_model(model_dir=self.model_dir)
+            SPRING_LOADED = True
+        self.stog = amrlib.stog_model
+
+    def testStoG(self):
+        graphs = self.stog.parse_sents(['This is a test of the system.'])
+        self.assertEqual(len(graphs), 1)
+
+    def testSpaCyDoc(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        graphs = doc._.to_amr()
+        self.assertEqual(len(graphs), 2)
+
+    def testSpaCySpan(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        span = list(doc.sents)[0]    # first sentence only
+        graphs = span._.to_amr()
+        self.assertEqual(len(graphs), 1)
+
+
+if __name__ == '__main__':
+    level = logging.WARNING
+    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
+    logging.basicConfig(level=level, format=format)
+
+    # run all methods that start with 'test'
+    unittest.main()
diff --git a/tests/auto/ModelParseT5v1.py b/tests/auto/ModelParseT5v1.py
new file mode 100644
index 0000000..bee18dd
--- /dev/null
+++ b/tests/auto/ModelParseT5v1.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import os
+import sys
+sys.path.insert(0, '../..')    # make '../..' first in the lib search path
+import logging
+import unittest
+import spacy
+import amrlib
+from amrlib.defaults import data_dir
+
+# Base classes and relative imports are proving to be problematic, so for now simply copy the code.
+
+# unittest creates a separate instance of the class for each test, so __init__ gets called a bunch of times.
+# However, they all seem to run in the same process, so globals are shared.
+# To avoid loading SpaCy multiple times, cache it in a global variable.
+# For the stog_model, amrlib caches it internally, and since there is only one process it will stay in memory
+# across all unit tests (even ones in other files when run with RunAllUnitTests.py) until explicitly
+# reloaded with amrlib.load_stog_model(model_dir).
+# When the SpaCy extensions are called, they check whether the global stog_model is None and
+# only call the loader if a model is not already loaded.
+T5V1_LOADED = None    # one-shot to ensure amrlib.stog_model is reloaded with this specific model
+SPACY_NLP = None
+class ModelParseT5v1(unittest.TestCase):
+    model_dir = os.path.join(data_dir, 'model_parse_t5-v0_1_0')
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        amrlib.setup_spacy_extension()
+        # Load/cache spacy
+        global SPACY_NLP
+        if SPACY_NLP is None:
+            SPACY_NLP = spacy.load('en_core_web_sm')
+        self.nlp = SPACY_NLP
+        # Load model in amrlib (amrlib will cache this itself)
+        global T5V1_LOADED
+        if T5V1_LOADED is None:
+            print('Loading', self.model_dir)
+            amrlib.load_stog_model(model_dir=self.model_dir)
+            T5V1_LOADED = True
+        self.stog = amrlib.stog_model
+
+    def testStoG(self):
+        graphs = self.stog.parse_sents(['This is a test of the system.'])
+        self.assertEqual(len(graphs), 1)
+
+    def testSpaCyDoc(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        graphs = doc._.to_amr()
+        self.assertEqual(len(graphs), 2)
+
+    def testSpaCySpan(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        span = list(doc.sents)[0]    # first sentence only
+        graphs = span._.to_amr()
+        self.assertEqual(len(graphs), 1)
+
+
+if __name__ == '__main__':
+    level = logging.WARNING
+    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
+    logging.basicConfig(level=level, format=format)
+
+    # run all methods that start with 'test'
+    unittest.main()
diff --git a/tests/auto/ModelParseT5v2.py b/tests/auto/ModelParseT5v2.py
new file mode 100644
index 0000000..bba3b01
--- /dev/null
+++ b/tests/auto/ModelParseT5v2.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import os
+import sys
+sys.path.insert(0, '../..')    # make '../..' first in the lib search path
+import logging
+import unittest
+import spacy
+import amrlib
+from amrlib.defaults import data_dir
+
+# Base classes and relative imports are proving to be problematic, so for now simply copy the code.
+
+# unittest creates a separate instance of the class for each test, so __init__ gets called a bunch of times.
+# However, they all seem to run in the same process, so globals are shared.
+# To avoid loading SpaCy multiple times, cache it in a global variable.
+# For the stog_model, amrlib caches it internally, and since there is only one process it will stay in memory
+# across all unit tests (even ones in other files when run with RunAllUnitTests.py) until explicitly
+# reloaded with amrlib.load_stog_model(model_dir).
+# When the SpaCy extensions are called, they check whether the global stog_model is None and
+# only call the loader if a model is not already loaded.
+T5V2_LOADED = None    # one-shot to ensure amrlib.stog_model is reloaded with this specific model
+SPACY_NLP = None
+class ModelParseT5v2(unittest.TestCase):
+    model_dir = os.path.join(data_dir, 'model_parse_t5-v0_2_0')
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        amrlib.setup_spacy_extension()
+        # Load/cache spacy
+        global SPACY_NLP
+        if SPACY_NLP is None:
+            SPACY_NLP = spacy.load('en_core_web_sm')
+        self.nlp = SPACY_NLP
+        # Load model in amrlib (amrlib will cache this itself)
+        global T5V2_LOADED
+        if T5V2_LOADED is None:
+            print('Loading', self.model_dir)
+            amrlib.load_stog_model(model_dir=self.model_dir)
+            T5V2_LOADED = True
+        self.stog = amrlib.stog_model
+
+    def testStoG(self):
+        graphs = self.stog.parse_sents(['This is a test of the system.'])
+        self.assertEqual(len(graphs), 1)
+
+    def testSpaCyDoc(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        graphs = doc._.to_amr()
+        self.assertEqual(len(graphs), 2)
+
+    def testSpaCySpan(self):
+        doc = self.nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
+        span = list(doc.sents)[0]    # first sentence only
+        graphs = span._.to_amr()
+        self.assertEqual(len(graphs), 1)
+
+
+if __name__ == '__main__':
+    level = logging.WARNING
+    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
+    logging.basicConfig(level=level, format=format)
+
+    # run all methods that start with 'test'
+    unittest.main()
diff --git a/tests/manual/Test_RBW_Alignment_String_Generation.py b/tests/manual/Test_RBW_Alignment_String_Generation.py
index 368eed4..d563245 100755
--- a/tests/manual/Test_RBW_Alignment_String_Generation.py
+++ b/tests/manual/Test_RBW_Alignment_String_Generation.py
@@ -8,9 +8,12 @@ from amrlib.alignments.penman_utils import test_for_decode_encode_issue
+# 11/27/2021: This test is currently broken
+
+
 # Manual test to see if amrlib can generate the alignment string from surface alignments correctly,
 # using the LDC data as the baseline
-# !! Note that you must first create the test corpus. See the scripts directory for this
+# !! Note that you must first create the test corpus. See scripts/60_RBW_Aligner/02_Build_Aligment_Test_Corpus.py
 if __name__ == '__main__':
     fname = 'amrlib/data/alignments/test_w_surface.txt'
     entries = load_amr_entries(fname)
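
Reviewer note (not part of the patch): the three new test modules all exercise the same amrlib parse-model API. Below is a minimal standalone sketch of that usage pattern, assuming amrlib, spacy, and the en_core_web_sm model are installed and that a parse model such as model_parse_t5-v0_2_0 has been extracted into amrlib's data directory (the same location the tests above use); it mirrors testStoG and testSpaCyDoc rather than defining anything new.

    #!/usr/bin/python3
    # Hypothetical usage sketch of the API covered by the new unit tests
    import os
    import spacy
    import amrlib
    from amrlib.defaults import data_dir

    # Register doc._.to_amr() / span._.to_amr() and load one of the parse models tested above
    amrlib.setup_spacy_extension()
    amrlib.load_stog_model(model_dir=os.path.join(data_dir, 'model_parse_t5-v0_2_0'))

    # Direct sentence-to-graph parsing, as in testStoG
    graphs = amrlib.stog_model.parse_sents(['This is a test of the system.'])
    print(graphs[0])

    # SpaCy extension path, as in testSpaCyDoc / testSpaCySpan
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('This is a test of the SpaCy extension. The test has multiple sentences.')
    for graph in doc._.to_amr():    # one AMR graph string per sentence
        print(graph)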