Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional column detection, pdf download for papers and fact sheets #22

Merged
merged 2 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 4 additions & 17 deletions elm/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,21 +206,6 @@ async def clean_txt_async(self, ignore_error=None, rate_limit=40e3):

return clean_pages

def is_double_col(self, separator=' '):
"""Does the text look like it has multiple vertical text columns?

Parameters
----------
separator : str
Heuristic split string to look for spaces between columns

Returns
-------
out : bool
True if more than one vertical text column
"""
return is_multi_col(self.full, separator=separator)

def clean_poppler(self, layout=True):
"""Clean the pdf using the poppler pdftotext utility

Expand Down Expand Up @@ -351,21 +336,23 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
self.full = combine_pages(self.pages)
return self.full

def convert_to_txt(self, txt_fp):
def convert_to_txt(self, txt_fp, separator=' '):
"""Function to convert contents of pdf document to txt file.

Parameters
----------
txt_fp: str
Directory for output txt file.
separator : str
Heuristic split string to look for spaces between columns

Returns
-------
text : str
Text string containing contents from pdf
"""
text = self.clean_poppler(layout=True)
if self.is_double_col():
if is_multi_col(text, separator=separator):
text = self.clean_poppler(layout=False)
text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
split_on='\n',
Expand Down
16 changes: 13 additions & 3 deletions elm/utilities/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
logger = logging.getLogger(__name__)


def is_multi_col(text, separator=" "):
def is_multi_col(text, separator=" ", threshold_ratio=0.35):
spodgorny9 marked this conversation as resolved.
Show resolved Hide resolved
"""Does the text look like it has multiple vertical text columns?

Parameters
Expand All @@ -23,14 +23,24 @@ def is_multi_col(text, separator=" "):
columns.
separator : str
Heuristic split string to look for spaces between columns
threshold_ratio : float
Minimum fraction of lines (between 0 and 1) that must contain the
separator for the text to be classified as multi-column.

Returns
-------
out : bool
True if more than one vertical text column
"""
n_cols = [len(line.strip().split(separator)) for line in text.split("\n")]
return np.median(n_cols) >= 2
lines = text.split("\n")
total_lines = len(lines)

gap_lines = [line for line in lines if separator in line.strip()]
cols = len(gap_lines)

ratio = cols / total_lines

return ratio >= threshold_ratio


def remove_blank_pages(pages):
Expand Down
15 changes: 10 additions & 5 deletions elm/web/rhub.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,10 +564,13 @@ def id(self):
id : str
Publication Number.
"""
group = self.get('keywordGroups')[0]
cont = group.get('keywordContainers')[0]
id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
id = id.replace('/', '-')
try:
group = self.get('keywordGroups')[0]
cont = group.get('keywordContainers')[0]
id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
id = id.replace('/', '-')
except TypeError:
id = self.get('externalId')

return id

Expand Down Expand Up @@ -690,7 +693,9 @@ def download(self, pdf_dir, txt_dir):
pdf_url = self.links[1]
abstract = self.abstract

if category != 'Technical Report':
pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']

if category not in pdf_categories:
fn = self.id.replace('/', '-') + '.txt'
fp = os.path.join(txt_dir, fn)
if not os.path.exists(fp):
Expand Down
6 changes: 4 additions & 2 deletions examples/research_hub/retrieve_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,17 @@
publications.download(PDF_DIR, TXT_DIR)
pubs_meta = publications.meta()

pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']

pubs_meta['fn'] = pubs_meta.apply(lambda row:
row['id'] + '.pdf'
if row['category'] == 'Technical Report'
if row['category'] in pdf_categories
and row['pdf_url'] is not None
and row['pdf_url'].endswith('.pdf')
else row['id'] + '.txt', axis=1)
pubs_meta['fp'] = pubs_meta.apply(lambda row:
PDF_DIR + row['id'] + '.pdf'
if row['category'] == 'Technical Report'
if row['category'] in pdf_categories
and row['pdf_url'] is not None
and row['pdf_url'].endswith('.pdf')
else TXT_DIR + row['fn'], axis=1)
Expand Down
14 changes: 14 additions & 0 deletions tests/utilities/test_utilities_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,20 @@ def test_is_multi_col():
double column!
"""
)
assert is_multi_col(
"""
Text that has multiple
columns and also has
lines without columns.
"""
)
assert not is_multi_col(
"""
Text that is mostly single
column but might have some
weird spacing like this.
"""
)


def test_remove_blank_pages():
Expand Down
Loading