Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rero bugfix #75

Merged
merged 4 commits into from
Jan 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 66 additions & 26 deletions tests/importers/test_rero_importer.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,83 @@
import pkg_resources
import bz2
import json
import logging
import os
from glob import glob

import pkg_resources

from text_importer.importers import CONTENTITEM_TYPE_IMAGE
from text_importer.importers.core import import_issues
from text_importer.importers.rero.detect import detect_issues
from text_importer.importers.rero.classes import ReroNewspaperIssue

import logging
from text_importer.importers.rero.detect import detect_issues

logger = logging.getLogger(__name__)


def test_import_issues():
    """Test the RERO XML importer with sample data.

    Detects the RERO2 sample issues shipped with the package and runs the
    full import pipeline on them, writing canonical JSON output to
    ``data/out/``.
    """
    inp_dir = pkg_resources.resource_filename(
            'text_importer',
            'data/sample_data/RERO2/'
            )
    access_rights_file = pkg_resources.resource_filename(
            'text_importer',
            'data/sample_data/RERO2/rero2_access_rights.json'
            )
    issues = detect_issues(
            base_dir=inp_dir,
            access_rights=access_rights_file
            )
    assert issues is not None
    assert len(issues) > 0

    result = import_issues(
            issues,
            out_dir=pkg_resources.resource_filename('text_importer', 'data/out/'),
            s3_bucket=None,
            issue_class=ReroNewspaperIssue,
            temp_dir=None,
            image_dirs=None,
            chunk_size=None
            )
    print(result)
def check_image_coordinates(issue_data):
    """Return True iff every image content item has a 4-element coordinate list.

    An issue with no image items passes trivially.
    """
    images = [
        item for item in issue_data['i']
        if item['m']['tp'] == CONTENTITEM_TYPE_IMAGE
    ]
    for image in images:
        meta = image['m']
        if 'c' not in meta or len(meta['c']) != 4:
            return False
    return True


def test_image_coordinates():
    """Check that imported image content items carry 4-part coordinates.

    Reads back the ``*.jsonl.bz2`` files produced in ``data/out/`` for the
    journals present in the RERO2 sample data and asserts, per issue, that
    every image content item has a ``c`` field with exactly four values.
    """
    inp_dir = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/'
    )
    out_dir = pkg_resources.resource_filename('text_importer', 'data/out/')
    access_rights_file = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/rero2_access_rights.json'
    )

    issues = detect_issues(
        base_dir=inp_dir,
        access_rights=access_rights_file
    )
    assert issues is not None
    assert len(issues) > 0

    # Collect the output files written for each journal in the sample set.
    journals = {issue.journal for issue in issues}
    patterns = [f"{journal}*.jsonl.bz2" for journal in journals]
    issue_files = [
        f for pattern in patterns
        for f in glob(os.path.join(out_dir, pattern))
    ]

    for filename in issue_files:
        with bz2.open(filename, "rt") as bzinput:
            for line in bzinput:
                issue = json.loads(line)
                assert check_image_coordinates(issue), "Images do not have coordinates"
Binary file added text_importer/data/out/BLB-1845-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/EZR-1840-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/FZG-1972-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/LLE-1946-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
82 changes: 54 additions & 28 deletions text_importer/importers/rero/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ def _find_pages(self):
logger.critical(f"Could not find pages for {self.id}")

page_file_names = [
file
for file in os.listdir(alto_path)
if not file.startswith('.') and '.xml' in file
]
file
for file in os.listdir(alto_path)
if not file.startswith('.') and '.xml' in file
]

page_numbers = []

Expand All @@ -62,9 +62,9 @@ def _find_pages(self):
page_numbers.append(int(page_no))

page_canonical_names = [
"{}-p{}".format(self.id, str(page_n).zfill(4))
for page_n in page_numbers
]
"{}-p{}".format(self.id, str(page_n).zfill(4))
for page_n in page_numbers
]

self.pages = []
for filename, page_no, page_id in zip(
Expand Down Expand Up @@ -108,11 +108,11 @@ def _parse_content_parts(self, content_div) -> List[Dict[str, str]]:

parts.append(
{
'comp_role': comp_role,
'comp_id': comp_id,
'comp_fileid': comp_fileid,
'comp_page_no': comp_page_no
}
'comp_role': comp_role,
'comp_id': comp_id,
'comp_fileid': comp_fileid,
'comp_page_no': comp_page_no
}
)
return parts

Expand All @@ -138,23 +138,26 @@ def _parse_content_item(self, item_div, counter: int):
logger.warning(f"Found new content item type: {div_type}")

metadata = {
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'tp': div_type,
'pp': [],
't': item_div.get('LABEL')
}
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'tp': div_type,
'pp': [],
't': item_div.get('LABEL')
}

content_item = {
"m": metadata,
"l": {
"id": item_div.get('ID'),
"parts": self._parse_content_parts(item_div)
}
"m": metadata,
"l": {
"id": item_div.get('ID'),
"parts": self._parse_content_parts(item_div)
}
}
for p in content_item['l']['parts']:
pge_no = p["comp_page_no"]
if pge_no not in content_item['m']['pp']:
content_item['m']['pp'].append(pge_no)

if content_item['m']['tp'] == CONTENTITEM_TYPE_IMAGE:
content_item['m']['c'], content_item['iiif_link'] = self._get_image_info(content_item['l']['parts'])
return content_item

def _parse_content_items(self, mets_doc: BeautifulSoup):
Expand Down Expand Up @@ -197,9 +200,32 @@ def _parse_mets(self):
content_items = self._parse_content_items(mets_doc)

self.issue_data = {
"cdt": strftime("%Y-%m-%d %H:%M:%S"),
"i": content_items,
"id": self.id,
"ar": self.rights,
"pp": [p.id for p in self.pages]
}
"cdt": strftime("%Y-%m-%d %H:%M:%S"),
"i": content_items,
"id": self.id,
"ar": self.rights,
"pp": [p.id for p in self.pages]
}

def _get_image_info(self, parts):
    """Find the coordinates and IIIF link of an image content item.

    :param parts: the content item's legacy ``parts`` list; expected to
        hold exactly one part, pointing at the ALTO ``TextBlock`` of the
        image on its page.
    :return: a pair ``(coords, iiif_link)`` where ``coords`` is
        ``[x, y, width, height]`` (empty list if no matching block is
        found) and ``iiif_link`` is the IIIF image URL (``None`` if no
        matching block is found).
    """
    assert len(parts) == 1, "Image has more than 1 part"
    part = parts[0]

    # Fetch page number and corresponding page object.
    pge_nb = part['comp_page_no']
    comp_id = part['comp_id']
    page = [p for p in self.pages if p.number == pge_nb][0]

    elements = page.xml.findAll("TextBlock", {"ID": comp_id})
    assert len(elements) <= 1, "Image comp_id matches multiple TextBlock tags"
    if len(elements) == 0:
        # Return a two-element result so the caller's tuple unpacking
        # (coords, link = self._get_image_info(...)) does not raise
        # ValueError when the block is missing.
        return [], None

    element = elements[0]
    coords = [
        int(element.get(attr))
        for attr in ('HPOS', 'VPOS', 'WIDTH', 'HEIGHT')
    ]
    iiif_link = os.path.join(
        IIIF_ENDPOINT_URL,
        page.id,
        ",".join(str(x) for x in coords),
        'full', '0', 'default.jpg'
    )

    return coords, iiif_link