Skip to content

Commit

Permalink
simplify further
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Nov 3, 2023
1 parent: dbf474c; commit: 0a4128e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 39 deletions.
24 changes: 7 additions & 17 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,7 @@ def select_candidate(
occurrences: Counter_Type[str],
catch: Pattern[str],
yearpat: Pattern[str],
original_date: bool,
min_date: datetime,
max_date: datetime,
options: Extractor,
) -> Optional[Match[str]]:
"""Select a candidate among the most frequent matches"""
if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES:
Expand All @@ -372,7 +370,7 @@ def select_candidate(
firstselect = occurrences.most_common(10)
LOGGER.debug("firstselect: %s", firstselect)
# sort and find probable candidates
bestones = sorted(firstselect, reverse=not original_date)[:2]
bestones = sorted(firstselect, reverse=not options.original)[:2]
LOGGER.debug("bestones: %s", bestones)

# plausibility heuristics
Expand All @@ -385,7 +383,7 @@ def select_candidate(
years[i] = year_match[1]
dateobject = datetime(int(year_match[1]), 1, 1)
validation[i] = is_valid_date(
dateobject, "%Y", earliest=min_date, latest=max_date
dateobject, "%Y", earliest=options.min, latest=options.max
)

# safety net: plausibility
Expand Down Expand Up @@ -422,9 +420,7 @@ def search_pattern(
earliest=options.min,
latest=options.max,
)
return select_candidate(
candidates, catch, yearpat, options.original, options.min, options.max
)
return select_candidate(candidates, catch, yearpat, options)


@lru_cache(maxsize=CACHE_SIZE)
Expand Down Expand Up @@ -651,9 +647,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
# select
bestmatch = select_candidate(
candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max
)
bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
result = filter_ymd_candidate(
bestmatch,
SELECT_YMD_PATTERN,
Expand Down Expand Up @@ -702,9 +696,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
candidate = normalize_match(match)
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
bestmatch = select_candidate(
candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max
)
bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
result = filter_ymd_candidate(
bestmatch,
SLASHES_PATTERN,
Expand Down Expand Up @@ -760,9 +752,7 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
# select
bestmatch = select_candidate(
candidates, YMD_PATTERN, YMD_YEAR, options.original, options.min, options.max
)
bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
result = filter_ymd_candidate(
bestmatch,
MMYYYY_PATTERN,
Expand Down
34 changes: 12 additions & 22 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,9 +860,9 @@ def test_compare_reference():
assert compare_reference(1517500000, "2018-02-01", options) == 1517500000


def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE):
def test_candidate_selection():
"""test the algorithm for several candidates"""
original_date = False
options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT)
# patterns
catch = re.compile(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")
yearpat = re.compile(r"^([0-9]{4})")
Expand All @@ -878,9 +878,7 @@ def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE):
"2-28",
]
)
result = select_candidate(
occurrences, catch, yearpat, original_date, min_date, max_date
)
result = select_candidate(occurrences, catch, yearpat, options)
assert result is None
# plausible
occurrences = Counter(
Expand All @@ -894,38 +892,30 @@ def test_candidate_selection(min_date=MIN_DATE, max_date=LATEST_POSSIBLE):
"2017-11-28",
]
)
result = select_candidate(
occurrences, catch, yearpat, original_date, min_date, max_date
)
result = select_candidate(occurrences, catch, yearpat, options)
assert result.group(0) == "2017-11-28"
original_date = True
result = select_candidate(
occurrences, catch, yearpat, original_date, min_date, max_date
)

options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, True, OUTPUTFORMAT)
result = select_candidate(occurrences, catch, yearpat, options)
assert result.group(0) == "2016-07-12"
# mix plausible/implausible
occurrences = Counter(
["2116-12-23", "2116-12-23", "2116-12-23", "2017-08-11", "2017-08-11"]
)
result = select_candidate(
occurrences, catch, yearpat, original_date, min_date, max_date
)
result = select_candidate(occurrences, catch, yearpat, options)
assert result.group(0) == "2017-08-11"
original_date = False

options = Extractor(False, LATEST_POSSIBLE, MIN_DATE, False, OUTPUTFORMAT)
occurrences = Counter(
["2116-12-23", "2116-12-23", "2116-12-23", "2017-08-11", "2017-08-11"]
)
result = select_candidate(
occurrences, catch, yearpat, original_date, min_date, max_date
)
result = select_candidate(occurrences, catch, yearpat, options)
assert result.group(0) == "2017-08-11"
# taking date present twice, corner case
occurrences = Counter(
["2016-12-23", "2016-12-23", "2017-08-11", "2017-08-11", "2017-08-11"]
)
result = select_candidate(
occurrences, catch, yearpat, original_date, min_date, max_date
)
result = select_candidate(occurrences, catch, yearpat, options)
assert result.group(0) == "2016-12-23"


Expand Down

0 comments on commit 0a4128e

Please sign in to comment.