Skip to content

Commit

Permalink
Merge pull request #75 from impresso/RERO-bugfix
Browse files Browse the repository at this point in the history
Rero bugfix

closes issue #75
  • Loading branch information
Matteo Romanello authored Jan 23, 2020
2 parents 929eae4 + acc9b3f commit 3457dbc
Show file tree
Hide file tree
Showing 47 changed files with 157 additions and 54 deletions.
92 changes: 66 additions & 26 deletions tests/importers/test_rero_importer.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,83 @@
import pkg_resources
import bz2
import json
import logging
import os
from glob import glob

import pkg_resources

from text_importer.importers import CONTENTITEM_TYPE_IMAGE
from text_importer.importers.core import import_issues
from text_importer.importers.rero.detect import detect_issues
from text_importer.importers.rero.classes import ReroNewspaperIssue

import logging
from text_importer.importers.rero.detect import detect_issues

logger = logging.getLogger(__name__)


def test_import_issues():
    """Test the RERO XML importer with sample data.

    Detects the sample RERO2 issues shipped with the package, then runs the
    full import pipeline on them, writing canonical JSON output to
    ``data/out/``. Fails if no issues are detected.
    """
    inp_dir = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/'
    )
    access_rights_file = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/rero2_access_rights.json'
    )
    issues = detect_issues(
        base_dir=inp_dir,
        access_rights=access_rights_file
    )
    assert issues is not None
    assert len(issues) > 0

    result = import_issues(
        issues,
        out_dir=pkg_resources.resource_filename('text_importer', 'data/out/'),
        s3_bucket=None,
        issue_class=ReroNewspaperIssue,
        temp_dir=None,
        image_dirs=None,
        chunk_size=None
    )
    # Log instead of print so the output goes through the configured handlers.
    logger.info("Imported issues, result: %s", result)
def check_image_coordinates(issue_data):
    """Return ``True`` iff every image content item of an issue has coordinates.

    An image content item passes when its metadata carries a ``'c'`` key whose
    value has exactly four entries (x, y, width, height). An issue without any
    image content items passes trivially.
    """
    image_items = [
        item for item in issue_data['i']
        if item['m']['tp'] == CONTENTITEM_TYPE_IMAGE
    ]
    for image in image_items:
        metadata = image['m']
        if 'c' not in metadata or len(metadata['c']) != 4:
            return False
    return True


def test_image_coordinates():
    """Verify that imported RERO issues give coordinates to all image items.

    Re-detects the sample issues to learn which journals were imported, then
    scans the corresponding ``*.jsonl.bz2`` files in ``data/out/`` and asserts
    (via :func:`check_image_coordinates`) that every image content item
    carries a 4-element coordinate list.
    """
    inp_dir = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/'
    )
    out_dir = pkg_resources.resource_filename('text_importer', 'data/out/')
    access_rights_file = pkg_resources.resource_filename(
        'text_importer',
        'data/sample_data/RERO2/rero2_access_rights.json'
    )

    issues = detect_issues(base_dir=inp_dir, access_rights=access_rights_file)
    assert issues is not None
    assert len(issues) > 0

    # One glob pattern per distinct journal, e.g. "FZG*.jsonl.bz2".
    journals = {issue.journal for issue in issues}
    issue_files = []
    for journal in journals:
        issue_files.extend(glob(os.path.join(out_dir, f"{journal}*.jsonl.bz2")))
    print(issue_files)

    for issue_file in issue_files:
        with bz2.open(issue_file, "rt") as reader:
            for line in reader:
                issue_json = json.loads(line)
                assert check_image_coordinates(issue_json), \
                    "Images do not have coordinates"
Binary file added text_importer/data/out/BLB-1845-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/EZR-1840-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/FZG-1972-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added text_importer/data/out/LLE-1946-issues.jsonl.bz2
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
82 changes: 54 additions & 28 deletions text_importer/importers/rero/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ def _find_pages(self):
logger.critical(f"Could not find pages for {self.id}")

page_file_names = [
file
for file in os.listdir(alto_path)
if not file.startswith('.') and '.xml' in file
]
file
for file in os.listdir(alto_path)
if not file.startswith('.') and '.xml' in file
]

page_numbers = []

Expand All @@ -62,9 +62,9 @@ def _find_pages(self):
page_numbers.append(int(page_no))

page_canonical_names = [
"{}-p{}".format(self.id, str(page_n).zfill(4))
for page_n in page_numbers
]
"{}-p{}".format(self.id, str(page_n).zfill(4))
for page_n in page_numbers
]

self.pages = []
for filename, page_no, page_id in zip(
Expand Down Expand Up @@ -108,11 +108,11 @@ def _parse_content_parts(self, content_div) -> List[Dict[str, str]]:

parts.append(
{
'comp_role': comp_role,
'comp_id': comp_id,
'comp_fileid': comp_fileid,
'comp_page_no': comp_page_no
}
'comp_role': comp_role,
'comp_id': comp_id,
'comp_fileid': comp_fileid,
'comp_page_no': comp_page_no
}
)
return parts

Expand All @@ -138,23 +138,26 @@ def _parse_content_item(self, item_div, counter: int):
logger.warning(f"Found new content item type: {div_type}")

metadata = {
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'tp': div_type,
'pp': [],
't': item_div.get('LABEL')
}
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'tp': div_type,
'pp': [],
't': item_div.get('LABEL')
}

content_item = {
"m": metadata,
"l": {
"id": item_div.get('ID'),
"parts": self._parse_content_parts(item_div)
}
"m": metadata,
"l": {
"id": item_div.get('ID'),
"parts": self._parse_content_parts(item_div)
}
}
for p in content_item['l']['parts']:
pge_no = p["comp_page_no"]
if pge_no not in content_item['m']['pp']:
content_item['m']['pp'].append(pge_no)

if content_item['m']['tp'] == CONTENTITEM_TYPE_IMAGE:
content_item['m']['c'], content_item['iiif_link'] = self._get_image_info(content_item['l']['parts'])
return content_item

def _parse_content_items(self, mets_doc: BeautifulSoup):
Expand Down Expand Up @@ -197,9 +200,32 @@ def _parse_mets(self):
content_items = self._parse_content_items(mets_doc)

self.issue_data = {
"cdt": strftime("%Y-%m-%d %H:%M:%S"),
"i": content_items,
"id": self.id,
"ar": self.rights,
"pp": [p.id for p in self.pages]
}
"cdt": strftime("%Y-%m-%d %H:%M:%S"),
"i": content_items,
"id": self.id,
"ar": self.rights,
"pp": [p.id for p in self.pages]
}

def _get_image_info(self, parts):
# Fetch the legacy parts

assert len(parts) == 1, "Image has more than 1 part"
part = parts[0]

# Fetch page number and corresponding page
pge_nb = part['comp_page_no']
comp_id = part['comp_id']
page = [p for p in self.pages if p.number == pge_nb][0]

elements = page.xml.findAll("TextBlock", {"ID": comp_id})
assert len(elements) <= 1, "Image comp_id matches multiple TextBlock tags"
if len(elements) == 0:
return []

element = elements[0]
hpos, vpos, width, height = element.get('HPOS'), element.get('VPOS'), element.get('WIDTH'), element.get('HEIGHT')
coords = [int(hpos), int(vpos), int(width), int(height)]
iiif_link = os.path.join(IIIF_ENDPOINT_URL, page.id, ",".join([str(x) for x in coords]), 'full', '0', 'default.jpg')

return coords, iiif_link

0 comments on commit 3457dbc

Please sign in to comment.