-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* wip * extracting from an initial range * removing example * range specification with abbreviations * detecting wrongly specified ranges * linting and licensing * disable too-many-locals pylint * refactoring page range * refactoring page range * Update haystack_experimental/components/extractors/llm_metadata_extractor.py Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * fixing range bug and cleaning unused code * range can also be provided at runtime * improving tests * Update haystack_experimental/components/extractors/llm_metadata_extractor.py Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * PR comments * fixing types for mypy * fixing pylint and mypy * Small changes to run method, helps with pylint --------- Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> Co-authored-by: Sebastian Husch Lee <sjrl423@gmail.com>
- Loading branch information
1 parent
4e1b37a
commit 142b646
Showing
4 changed files
with
187 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai> | ||
# | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from typing import List, Union | ||
|
||
|
||
def expand_page_range(page_range: List[Union[str, int]]) -> List[int]:
    """
    Takes a list of page numbers and ranges and expands them into a list of page numbers.

    For example, given a page_range=['1-3', '5', '8', '10-12'] the function will return [1, 2, 3, 5, 8, 10, 11, 12]

    Pages are returned in the order they appear in the input; duplicates are not removed.

    :param page_range: List of page numbers and ranges. Each item is either an integer, a string of digits
        (e.g. '5'), or a string in the format 'start-end' (e.g. '10-12', both ends inclusive).
    :returns:
        An expanded list of page integers
    :raises ValueError:
        If an item is a negative integer, a malformed or reversed range, or any other unsupported value,
        or if the input yields no page numbers at all.
    """
    expanded_page_range: List[int] = []

    for page in page_range:
        if isinstance(page, int):
            # A negative integer is most likely a range written without quotes:
            # Python evaluates the expression 1-3 to -2 before it reaches this function.
            if page < 0:
                msg = "range must be a string in the format 'start-end'"
                raise ValueError(f"Invalid page range: {page} - {msg}")
            expanded_page_range.append(page)

        elif isinstance(page, str) and page.isdigit():
            expanded_page_range.append(int(page))

        elif isinstance(page, str) and "-" in page:
            msg = "range must be a string in the format 'start-end'"
            bounds = page.split("-")
            # Exactly one separator is allowed; '1-2-3' or '1--3' are rejected here
            # with the same message as every other malformed entry.
            if len(bounds) != 2:
                raise ValueError(f"Invalid page range: {page} - {msg}")
            try:
                start, end = int(bounds[0]), int(bounds[1])
            except ValueError:
                # Non-numeric or missing bound, e.g. 'a-b' or '-3'.
                raise ValueError(f"Invalid page range: {page} - {msg}") from None
            # A reversed range would expand to nothing and silently drop the entry.
            if start > end:
                raise ValueError(f"Invalid page range: {page} - start must not be greater than end")
            expanded_page_range.extend(range(start, end + 1))

        else:
            msg = "range must be a string in the format 'start-end' or an integer"
            raise ValueError(f"Invalid page range: {page} - {msg}")

    if not expanded_page_range:
        raise ValueError("No valid page numbers or ranges found in the input list")

    return expanded_page_range
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import pytest | ||
|
||
from haystack_experimental.util.utils import expand_page_range | ||
|
||
def test_expand_page_range_valid_input():
    """Check that valid mixes of integers, digit strings and 'start-end' ranges expand as expected."""
    # Each entry pairs an input specification with the expanded list it must produce.
    cases = [
        ([1, 3], [1, 3]),
        (['1-3'], [1, 2, 3]),
        (['1-3', 5, 8, '10-12'], [1, 2, 3, 5, 8, 10, 11, 12]),
        (['1-3', '5', '8', '10-12'], [1, 2, 3, 5, 8, 10, 11, 12]),
        (
            ['1-3', 5, 8, '10-12', '15-20', 50],
            [1, 2, 3, 5, 8, 10, 11, 12, 15, 16, 17, 18, 19, 20, 50],
        ),
    ]
    for spec, expected in cases:
        assert expand_page_range(spec) == expected
|
||
|
||
def test_expand_page_range_invalid_input():
    """Check that each unsupported input specification is rejected with a ValueError."""
    invalid_specs = [
        # a string that is neither digits nor a 'start-end' range
        ['1-3', 'non_digit_string', 8, '10-12', '15-20', '50'],
        # an unquoted range: Python evaluates 1 - 3 to the integer -2
        [1 - 3, 5, 8],
        # an empty specification
        [],
    ]
    for spec in invalid_specs:
        with pytest.raises(ValueError):
            expand_page_range(spec)