From c539cf1f3fe7e4fe23c441761f0aa273382c6547 Mon Sep 17 00:00:00 2001 From: GabrielKS <23368820+GabrielKS@users.noreply.github.com> Date: Thu, 19 Aug 2021 12:19:13 -0700 Subject: [PATCH 1/3] Trivially fix expected trip file leak bug + Fixes https://github.com/e-mission/e-mission-docs/issues/654 --- emission/analysis/userinput/expectations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/emission/analysis/userinput/expectations.py b/emission/analysis/userinput/expectations.py index 646ae7bf2..b421b5a0b 100644 --- a/emission/analysis/userinput/expectations.py +++ b/emission/analysis/userinput/expectations.py @@ -45,6 +45,7 @@ def _process_and_save_trip(user_id, inferred_trip, ts): expected_trip["data"]["expectation"] = expectation expected_trip["data"]["confidence_threshold"] = confidence_threshold ts.insert(expected_trip) + return expected_trip # Fixes https://github.com/e-mission/e-mission-docs/issues/654 # This is a placeholder. TODO: implement the real algorithm def _get_expectation_for_trip(trip): From e2f48edae5c525c69051a2857728065796a74f08 Mon Sep 17 00:00:00 2001 From: GabrielKS <23368820+GabrielKS@users.noreply.github.com> Date: Thu, 19 Aug 2021 13:17:45 -0700 Subject: [PATCH 2/3] Add a test to prevent similar bugs in the future + Meaning bugs similar to https://github.com/e-mission/e-mission-docs/issues/654 Tested by ensuring it fails with the expectations bug active and passes with the expectations bug fixed. --- .../TestPipelineStageNonrepetition.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 emission/tests/pipelineTests/TestPipelineStageNonrepetition.py diff --git a/emission/tests/pipelineTests/TestPipelineStageNonrepetition.py b/emission/tests/pipelineTests/TestPipelineStageNonrepetition.py new file mode 100644 index 000000000..7cc60f9a4 --- /dev/null +++ b/emission/tests/pipelineTests/TestPipelineStageNonrepetition.py @@ -0,0 +1,83 @@ +# https://github.com/e-mission/e-mission-docs/issues/654 shows that it's easy to mess up pipeline stages such that they act on trips they've already acted on. +# Let's write a test to make sure that doesn't happen again. +# Lots of boilerplate here copypasted from TestExpectationPipeline. TODO: refactor that out. + +import unittest +import numpy as np + +import emission.tests.common as etc +import emission.core.get_database as edb +import emission.core.wrapper.labelprediction as ecwl +import emission.analysis.classification.inference.labels.pipeline as eacilp +import emission.analysis.classification.inference.labels.inferrers as eacili +import emission.analysis.configs.expectation_notification_config as eace +import emission.analysis.userinput.expectations as eaue + +class TestExpectationPipeline(unittest.TestCase): + # The limitation of this approach is that one must manually add database keys here. + # So when implementing a new pipeline stage, put the relevant key(s) here! + # I've tried to go through the various existing pipeline stages and put some of their keys here, but this may already be incomplete. + # If a stage writes to the database with multiple keys, not all keys must necessarily be included. + keys_to_track = [ + "segmentation/raw_place", + "segmentation/raw_trip", + "segmentation/raw_stop", + "analysis/smoothing", + "analysis/cleaned_trip", + "analysis/inferred_section", + "analysis/inferred_trip", + "analysis/expected_trip", + "analysis/confirmed_trip" + ] + + def setUp(self): + self.test_options_stash = eace._test_options + eace._test_options = { + "use_sample": True, + "override_keylist": None + } + eace.reload_config() + + np.random.seed(61297777) + self.reset_all() + etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") + self.run_pipeline() + + def tearDown(self): + self.reset_all() + eace._test_options = self.test_options_stash + eace.reload_config() + + def run_pipeline(self, algorithms={ecwl.AlgorithmTypes.PLACEHOLDER_2: eacili.placeholder_predictor_2}): + primary_algorithms_stash = eacilp.primary_algorithms + eacilp.primary_algorithms = algorithms + test_options_stash = eaue._test_options + etc.runIntakePipeline(self.testUUID) # testUUID is set in setupRealExample + eacilp.primary_algorithms = primary_algorithms_stash + eaue._test_options = test_options_stash + + def reset_all(self): + etc.dropAllCollections(edb._get_current_db()) + + def count_keys(self): + counts = {} + db = edb.get_analysis_timeseries_db() + for key in self.keys_to_track: + count = db.find({"metadata.key": key, "user_id": self.testUUID}).count() + counts[key] = count + # print(counts) + return counts + + def testPipelineStageNonrepetition(self): + before_keys = self.count_keys() + self.run_pipeline() + after_keys = self.count_keys() + for key in self.keys_to_track: + self.assertEqual(before_keys[key], after_keys[key], key) + +def main(): + etc.configLogging() + unittest.main() + +if __name__ == "__main__": + main() From 903cac3956886414db8aa701d93b55f2edf719c3 Mon Sep 17 00:00:00 2001 From: GabrielKS <23368820+GabrielKS@users.noreply.github.com> Date: Thu, 19 Aug 2021 17:34:07 -0700 Subject: [PATCH 3/3] Fix small issues in TestPipelineStageNonrepetition + Rename the class from what I copypasted + Avoid using deprecated method --- .../tests/pipelineTests/TestPipelineStageNonrepetition.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emission/tests/pipelineTests/TestPipelineStageNonrepetition.py b/emission/tests/pipelineTests/TestPipelineStageNonrepetition.py index 7cc60f9a4..1f4acd0ab 100644 --- a/emission/tests/pipelineTests/TestPipelineStageNonrepetition.py +++ b/emission/tests/pipelineTests/TestPipelineStageNonrepetition.py @@ -13,7 +13,7 @@ import emission.analysis.configs.expectation_notification_config as eace import emission.analysis.userinput.expectations as eaue -class TestExpectationPipeline(unittest.TestCase): +class TestPipelineStageNonrepetition(unittest.TestCase): # The limitation of this approach is that one must manually add database keys here. # So when implementing a new pipeline stage, put the relevant key(s) here! # I've tried to go through the various existing pipeline stages and put some of their keys here, but this may already be incomplete. @@ -63,7 +63,7 @@ def count_keys(self): counts = {} db = edb.get_analysis_timeseries_db() for key in self.keys_to_track: - count = db.find({"metadata.key": key, "user_id": self.testUUID}).count() + count = db.count_documents({"metadata.key": key, "user_id": self.testUUID}) counts[key] = count # print(counts) return counts