Skip to content

Commit

Permalink
test(matchers): add tests for matchers (#472)
Browse files Browse the repository at this point in the history
See #380
  • Loading branch information
Hiromu Hota authored Jul 5, 2020
1 parent 21a5a1e commit 6cc0a7d
Show file tree
Hide file tree
Showing 3 changed files with 256 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ force_grid_wrap=0
combine_as_imports=True
line_length=88
known_first_party = fonduer,tests
known_third_party = IPython,bs4,cloudpickle,editdistance,emmental,lxml,mlflow,numpy,packaging,pandas,pytest,scipy,setuptools,snorkel,spacy,sqlalchemy,tensorboardX,torch,treedlib,wand,yaml
known_third_party = IPython,bs4,cloudpickle,editdistance,emmental,lxml,mlflow,nltk,numpy,packaging,pandas,pytest,scipy,setuptools,snorkel,spacy,sqlalchemy,tensorboardX,torch,treedlib,wand,yaml
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ black>=18.9b0
flake8>=3.8.0
flake8-docstrings
mypy
nltk
isort
pre-commit
pytest
Expand Down
257 changes: 254 additions & 3 deletions tests/candidates/test_matchers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,30 @@
"""Fonduer unit tests for matchers."""
from unittest.mock import Mock

import pytest
from nltk.stem.porter import PorterStemmer

from fonduer.candidates.matchers import Intersect, Inverse, RegexMatchSpan, Union
from fonduer.candidates.mentions import MentionNgrams
from fonduer.candidates.matchers import (
Concat,
DateMatcher,
DictionaryMatch,
Intersect,
Inverse,
LambdaFunctionFigureMatcher,
LambdaFunctionMatcher,
LocationMatcher,
MiscMatcher,
NumberMatcher,
OrganizationMatcher,
PersonMatcher,
RegexMatchEach,
RegexMatchSpan,
Union,
)
from fonduer.candidates.mentions import MentionFigures, MentionNgrams
from fonduer.candidates.models.span_mention import TemporarySpanMention
from fonduer.parser.lingual_parser.spacy_parser import SpacyParser
from fonduer.parser.models import Document, Sentence
from fonduer.parser.models import Document, Figure, Sentence


@pytest.fixture()
Expand Down Expand Up @@ -66,6 +85,10 @@ def test_union(doc_setup):
"is apple",
}

# Unsupported option should raise an exception
with pytest.raises(Exception):
Union(matcher0, matcher1, long_match_only=False)


def test_intersect(doc_setup):
"""Test intersect matcher."""
Expand Down Expand Up @@ -173,3 +196,231 @@ def test_inverse(doc_setup):
# Check if Inverse raises an error when two child matchers are provided.
with pytest.raises(ValueError):
Inverse(matcher0, matcher0)


def test_cancat(doc_setup):
    """Test Concat matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def matched_spans(m):
        """Collect the span strings that ``m`` yields over the n-gram space."""
        return {tc.get_span() for tc in m.apply(space.apply(doc))}

    # Both child matchers share the same regex options; only the pattern differs.
    regex_opts = dict(search=False, full_match=False, longest_match_only=False)
    matcher0 = RegexMatchSpan(rgx=r"this", **regex_opts)
    matcher1 = RegexMatchSpan(rgx=r"is", **regex_opts)

    matcher = Concat(matcher0, matcher1)
    assert matched_spans(matcher) == {"This is"}

    # Test if matcher raises an error when _f is given non-TemporarySpanMention
    with pytest.raises(ValueError):
        list(matcher.apply(doc.sentences[0].words))

    # Test if an error is raised when the number of child matchers is not 2.
    # The error surfaces when the matcher is applied, not when it is built.
    matcher = Concat(matcher0)
    with pytest.raises(ValueError):
        list(matcher.apply(space.apply(doc)))

    # Test with left_required=False
    matcher = Concat(matcher0, matcher1, left_required=False)
    assert matched_spans(matcher) == {"This is", "is apple"}

    # Test with right_required=False
    matcher = Concat(matcher0, matcher1, right_required=False)
    assert matched_spans(matcher) == {"This is"}

    # Test with permutations=False: swapped children should match nothing.
    matcher = Concat(matcher1, matcher0, permutations=False)
    assert set(matcher.apply(space.apply(doc))) == set()

    # Test with permutations=True: swapped children should match again.
    matcher = Concat(matcher1, matcher0, permutations=True)
    assert matched_spans(matcher) == {"This is"}

    # TODO: Add a test for ignore_sep=False


def test_dictionary_match(doc_setup):
    """Test DictionaryMatch matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=1)

    def matched_spans(m):
        """Collect the span strings that ``m`` yields over the unigram space."""
        return {tc.get_span() for tc in m.apply(space.apply(doc))}

    # Test with a list of str: the entry "this" is expected to match "This".
    list_matcher = DictionaryMatch(d=["this"])
    assert matched_spans(list_matcher) == {"This"}

    # Test without a dictionary
    with pytest.raises(Exception):
        DictionaryMatch()

    # TODO: test with plural words
    stemming_matcher = DictionaryMatch(d=["is"], stemmer=PorterStemmer())
    assert matched_spans(stemming_matcher) == {"is"}

    # Test if matcher raises an error when _f is given non-TemporarySpanMention
    plain_matcher = DictionaryMatch(d=["this"])
    with pytest.raises(ValueError):
        list(plain_matcher.apply(doc.sentences[0].words))


def test_do_not_use_stemmer_when_UnicodeDecodeError():
    """Test DictionaryMatch when stemmer causes UnicodeDecodeError."""
    stemmer = PorterStemmer()

    # With a working stemmer, _stem(w) should return a word stem.
    working = DictionaryMatch(d=["is"], stemmer=stemmer)
    assert working._stem("caresses") == "caress"

    # Replace the stemmer's stem method with one that always fails to decode.
    decode_error = UnicodeDecodeError("dummycodec", b"\x00\x00", 1, 2, "Dummy !")
    stemmer.stem = Mock(side_effect=decode_error)

    # _stem(w) should return w unchanged as stemmer.stem raises UnicodeDecodeError.
    failing = DictionaryMatch(d=["is"], stemmer=stemmer)
    assert failing._stem("caresses") == "caresses"


def test_lambda_function_matcher(doc_setup):
    """Test LambdaFunctionMatcher matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=1)

    # Test with a lambda function that accepts every candidate: all unigrams
    # of the fixture document are expected to come back.
    matcher = LambdaFunctionMatcher(func=lambda x: True)
    matched = {tc.get_span() for tc in matcher.apply(space.apply(doc))}
    assert matched == {"This", "is", "apple"}

    # Test if matcher raises an error when _f is given non-TemporarySpanMention
    with pytest.raises(ValueError):
        list(matcher.apply(doc.sentences[0].words))

    # Test if an error is raised when a func is not provided.
    with pytest.raises(Exception):
        LambdaFunctionMatcher()


def test_regex_match(doc_setup):
    """Test RegexMatch matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    # A wrong option name ("regex" instead of "rgx") should raise an exception.
    with pytest.raises(Exception):
        RegexMatchSpan(regex=r"apple")

    # Test if both regex matchers raise an error when _f is given
    # non-TemporarySpanMention
    span_matcher = RegexMatchSpan(rgx=r"apple")
    each_matcher = RegexMatchEach(rgx=r"apple")
    for regex_matcher in (span_matcher, each_matcher):
        with pytest.raises(ValueError):
            list(regex_matcher.apply(doc.sentences[0].words))

    # Test if RegexMatchEach works as expected.
    matched = {tc.get_span() for tc in each_matcher.apply(space.apply(doc))}
    assert matched == {"apple"}

    # Test ignore_case option: a case-sensitive "Apple" should match nothing.
    case_sensitive = RegexMatchEach(rgx=r"Apple", ignore_case=False)
    assert not list(case_sensitive.apply(space.apply(doc)))


def test_ner_matchers():
    """Test different ner type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join(
        [
            "Tim Cook was born in USA in 1960.",
            "He is the CEO of Apple.",
            "He sold 100 million of iPhone.",
        ]
    )
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)

    # Manually attach ner_tags as the result from spacy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    # TODO: replace "NUMBER" with "CARDINAL" (#473)
    doc.sentences[2].ner_tags = ["O", "O", "NUMBER", "NUMBER", "O", "MISC", "O"]

    # The length of words and that of ner_tags should match for every sentence
    # whose tags were overwritten by hand above (all three, not just the first
    # two), otherwise the matcher expectations below would be meaningless.
    for sentence in doc.sentences[:3]:
        assert len(sentence.words) == len(sentence.ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Each NER matcher paired with the exact set of spans it is expected to
    # produce from the hand-tagged sentences.
    expectations = [
        (PersonMatcher, {"Tim Cook"}),
        (LocationMatcher, {"USA"}),
        (DateMatcher, {"1960"}),
        (OrganizationMatcher, {"Apple"}),
        (NumberMatcher, {"100 million"}),
        (MiscMatcher, {"iPhone"}),
    ]
    for matcher_cls, expected_spans in expectations:
        matcher = matcher_cls()
        matched = {tc.get_span() for tc in matcher.apply(space.apply(doc))}
        # The class name in the message identifies which matcher failed.
        assert matched == expected_spans, matcher_cls.__name__


def test_figure_matcher(doc_setup):
    """Test matchers for figures."""
    doc = doc_setup

    # Create two dummy figures; passing document=doc is what attaches them,
    # as the count check below confirms.
    target_id, other_id = 2, 3
    Figure(id=target_id, document=doc)
    Figure(id=other_id, document=doc)
    assert len(doc.figures) == 2

    space = MentionFigures()
    assert len(list(space.apply(doc))) == 2

    # Set up a matcher that matches figures with id==2.
    matcher = LambdaFunctionFigureMatcher(func=lambda tf: tf.figure.id == 2)

    # Test if matcher only matches the first figure.
    matched_ids = [tf.figure.id for tf in matcher.apply(space.apply(doc))]
    assert len(matched_ids) == 1
    assert {tf.figure.id for tf in matcher.apply(space.apply(doc))} == {target_id}

    # The keyword arg should be "func"
    with pytest.raises(Exception):
        LambdaFunctionFigureMatcher(function=lambda tf: tf.figure.id == 2)

    # LambdaFunctionFigureMatcher only supports TemporaryFigureMention.
    space = MentionNgrams(n_min=1, n_max=2)
    with pytest.raises(ValueError):
        list(matcher.apply(space.apply(doc)))

0 comments on commit 6cc0a7d

Please sign in to comment.