diff --git a/spellpy/spell.py b/spellpy/spell.py index a335d24..c6d024e 100644 --- a/spellpy/spell.py +++ b/spellpy/spell.py @@ -122,12 +122,8 @@ def LCSMatch(self, LCSMap, seq): maxLen = -1 maxLCSObject = None - set_seq = set(seq) size_seq = len(seq) for LCSObject in LCSMap: - set_template = set(LCSObject.logTemplate) - if len(set_seq & set_template) < 0.5 * size_seq: - continue lcs = self.LCS(seq, LCSObject.logTemplate) if len(lcs) > maxLen or (len(lcs) == maxLen and len(LCSObject.logTemplate) < len(maxLCSObject.logTemplate)): maxLen = len(lcs) diff --git a/tests/test_spellpy.py b/tests/test_spellpy.py index fa428ed..4bfcd44 100644 --- a/tests/test_spellpy.py +++ b/tests/test_spellpy.py @@ -94,6 +94,19 @@ def test_getTemplate(self): new_template = self.parser.getTemplate(lcs, seq) self.assertListEqual(new_template, expected_template) + def test_LCSMatch_with_repeated_tokens(self): + # Regression test for the set-overlap pre-filter that LCSMatch previously applied before computing the LCS. + # 'seq' repeats 'K' ten times: 20 tokens, 11 distinct, 10 of them shared with the template. + # NOTE(review): 10 < 0.5 * 20 is False, so that pre-filter would not have skipped this template; confirm this input really fails on the old code. + logmessageL = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'] + seq = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K', 'K'] + logID = 0 + newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID]) + + retLogClust = self.parser.LCSMatch([newCluster], seq) + self.assertIsNotNone(retLogClust) + self.assertListEqual(retLogClust.logTemplate, newCluster.logTemplate) + def helper(rootNode): if rootNode.childD == dict():