Skip to content

Commit

Permalink
Merge pull request #75 from impresso/RERO-bugfix
Browse files Browse the repository at this point in the history
Rero bugfix

closes issue #75
  • Loading branch information
Matteo Romanello authored Jan 23, 2020
2 parents 929eae4 + acc9b3f commit 3457dbc
Show file tree
Hide file tree
Showing 47 changed files with 157 additions and 54 deletions.
92 changes: 66 additions & 26 deletions tests/importers/test_rero_importer.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,83 @@
import pkg_resources
import bz2
import json
import logging
import os
from glob import glob

import pkg_resources

from text_importer.importers import CONTENTITEM_TYPE_IMAGE
from text_importer.importers.core import import_issues
from text_importer.importers.rero.detect import detect_issues
from text_importer.importers.rero.classes import ReroNewspaperIssue

import logging
from text_importer.importers.rero.detect import detect_issues

logger = logging.getLogger(__name__)


def test_import_issues():
    """Test the RERO XML importer with sample data.

    Detects the sample RERO2 issues shipped with the package, then runs the
    full import pipeline on them, writing canonical JSON output to
    ``data/out/``. Fails if no issues are detected.
    """
    inp_dir = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/'
    )
    access_rights_file = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/rero2_access_rights.json'
    )
    issues = detect_issues(
        base_dir=inp_dir,
        access_rights=access_rights_file
    )
    assert issues is not None
    assert len(issues) > 0

    result = import_issues(
        issues,
        out_dir=pkg_resources.resource_filename('text_importer', 'data/out/'),
        s3_bucket=None,
        issue_class=ReroNewspaperIssue,
        temp_dir=None,
        image_dirs=None,
        chunk_size=None
    )
    # Log instead of print so the output goes through the configured handlers.
    logger.info("Imported issues, result: %s", result)
def check_image_coordinates(issue_data):
    """Return ``True`` iff every image content item of an issue has coordinates.

    An image content item passes when its metadata carries a ``'c'`` key whose
    value has exactly four entries (x, y, width, height). An issue without any
    image content items passes trivially.
    """
    image_items = [
        item for item in issue_data['i']
        if item['m']['tp'] == CONTENTITEM_TYPE_IMAGE
    ]
    for image in image_items:
        metadata = image['m']
        if 'c' not in metadata or len(metadata['c']) != 4:
            return False
    return True


def test_image_coordinates():
    """Verify that imported RERO issues give coordinates to all image items.

    Re-detects the sample issues to learn which journals were imported, then
    scans the corresponding ``*.jsonl.bz2`` files in ``data/out/`` and asserts
    (via :func:`check_image_coordinates`) that every image content item
    carries a 4-element coordinate list.
    """
    inp_dir = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/'
    )
    out_dir = pkg_resources.resource_filename('text_importer', 'data/out/')
    access_rights_file = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/rero2_access_rights.json'
    )

    issues = detect_issues(base_dir=inp_dir, access_rights=access_rights_file)
    assert issues is not None
    assert len(issues) > 0

    # One glob pattern per distinct journal, e.g. "FZG*.jsonl.bz2".
    journals = {issue.journal for issue in issues}
    issue_files = []
    for journal in journals:
        issue_files.extend(glob(os.path.join(out_dir, f"{journal}*.jsonl.bz2")))
    print(issue_files)

    for issue_file in issue_files:
        with bz2.open(issue_file, "rt") as reader:
            for line in reader:
                issue_json = json.loads(line)
                assert check_image_coordinates(issue_json), \
                    "Images do not have coordinates"
Binary file added text_importer/data/out/BLB-1845-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/EZR-1840-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/FZG-1972-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/LLE-1946-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
82 changes: 54 additions & 28 deletions text_importer/importers/rero/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ def _find_pages(self):
logger.critical(f"Could not find pages for {self.id}")

page_file_names = [
file
for file in os.listdir(alto_path)
if not file.startswith('.') and '.xml' in file
]
file
for file in os.listdir(alto_path)
if not file.startswith('.') and '.xml' in file
]

page_numbers = []

Expand All @@ -62,9 +62,9 @@ def _find_pages(self):
page_numbers.append(int(page_no))

page_canonical_names = [
"{}-p{}".format(self.id, str(page_n).zfill(4))
for page_n in page_numbers
]
"{}-p{}".format(self.id, str(page_n).zfill(4))
for page_n in page_numbers
]

self.pages = []
for filename, page_no, page_id in zip(
Expand Down Expand Up @@ -108,11 +108,11 @@ def _parse_content_parts(self, content_div) -> List[Dict[str, str]]:

parts.append(
{
'comp_role': comp_role,
'comp_id': comp_id,
'comp_fileid': comp_fileid,
'comp_page_no': comp_page_no
}
'comp_role': comp_role,
'comp_id': comp_id,
'comp_fileid': comp_fileid,
'comp_page_no': comp_page_no
}
)
return parts

Expand All @@ -138,23 +138,26 @@ def _parse_content_item(self, item_div, counter: int):
logger.warning(f"Found new content item type: {div_type}")

metadata = {
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'tp': div_type,
'pp': [],
't': item_div.get('LABEL')
}
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'tp': div_type,
'pp': [],
't': item_div.get('LABEL')
}

content_item = {
"m": metadata,
"l": {
"id": item_div.get('ID'),
"parts": self._parse_content_parts(item_div)
}
"m": metadata,
"l": {
"id": item_div.get('ID'),
"parts": self._parse_content_parts(item_div)
}
}
for p in content_item['l']['parts']:
pge_no = p["comp_page_no"]
if pge_no not in content_item['m']['pp']:
content_item['m']['pp'].append(pge_no)

if content_item['m']['tp'] == CONTENTITEM_TYPE_IMAGE:
content_item['m']['c'], content_item['iiif_link'] = self._get_image_info(content_item['l']['parts'])
return content_item

def _parse_content_items(self, mets_doc: BeautifulSoup):
Expand Down Expand Up @@ -197,9 +200,32 @@ def _parse_mets(self):
content_items = self._parse_content_items(mets_doc)

self.issue_data = {
"cdt": strftime("%Y-%m-%d %H:%M:%S"),
"i": content_items,
"id": self.id,
"ar": self.rights,
"pp": [p.id for p in self.pages]
}
"cdt": strftime("%Y-%m-%d %H:%M:%S"),
"i": content_items,
"id": self.id,
"ar": self.rights,
"pp": [p.id for p in self.pages]
}

def _get_image_info(self, parts):
# Fetch the legacy parts

assert len(parts) == 1, "Image has more than 1 part"
part = parts[0]

# Fetch page number and corresponding page
pge_nb = part['comp_page_no']
comp_id = part['comp_id']
page = [p for p in self.pages if p.number == pge_nb][0]

elements = page.xml.findAll("TextBlock", {"ID": comp_id})
assert len(elements) <= 1, "Image comp_id matches multiple TextBlock tags"
if len(elements) == 0:
return []

element = elements[0]
hpos, vpos, width, height = element.get('HPOS'), element.get('VPOS'), element.get('WIDTH'), element.get('HEIGHT')
coords = [int(hpos), int(vpos), int(width), int(height)]
iiif_link = os.path.join(IIIF_ENDPOINT_URL, page.id, ",".join([str(x) for x in coords]), 'full', '0', 'default.jpg')

return coords, iiif_link

0 comments on commit 3457dbc

Please sign in to comment.