Skip to content

Commit

Permalink
Embed img if it is jpeg
Browse files Browse the repository at this point in the history
  • Loading branch information
Hiromu Hota committed Oct 22, 2020
1 parent 94e1ad8 commit 519f38c
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 13 deletions.
32 changes: 22 additions & 10 deletions pdftotree/TreeExtract.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import html
import logging
import os
from base64 import b64encode
from functools import cmp_to_key
from typing import Any, Dict, List, Optional, Tuple
from xml.dom.minidom import Document, Element

import numpy as np
import tabula
from pdfminer.layout import LAParams, LTChar, LTPage, LTTextLine
from pdfminer.layout import LAParams, LTChar, LTImage, LTPage, LTTextLine
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
Expand All @@ -16,7 +17,7 @@

from pdftotree._version import __version__
from pdftotree.ml.features import get_lines_features, get_mentions_within_bbox
from pdftotree.utils.bbox_utils import get_rectangles
from pdftotree.utils.bbox_utils import bbox2str, get_rectangles
from pdftotree.utils.lines_utils import (
extend_horizontal_lines,
extend_vertical_lines,
Expand Down Expand Up @@ -322,12 +323,29 @@ def get_html_tree(self) -> str:
table_element = self.get_html_table(table, page_num)
page.appendChild(table_element)
elif box[0] == "figure":
elems: List[LTTextLine] = get_mentions_within_bbox(
box, self.elems[page_num].figures
)
fig_element = doc.createElement("figure")
page.appendChild(fig_element)
top, left, bottom, right = [int(i) for i in box[1:]]
fig_element.setAttribute(
"title", f"bbox {left} {top} {right} {bottom}"
)
for img in [img for elem in elems for img in elem]:
if not isinstance(img, LTImage):
continue
data = img.stream.get_rawdata()
base64 = b64encode(data).decode("ascii")
if data.startswith(b"\xff\xd8\xff"):
img_element = doc.createElement("img")
fig_element.appendChild(img_element)
img_element.setAttribute("title", bbox2str(img.bbox))
img_element.setAttribute(
"src", f"data:image/jpeg;base64,{base64}"
)
else:
logger.warning(f"Skipping an image of unknown type: {img}.")
else:
element = self.get_html_others(box[0], box[1:], page_num)
page.appendChild(element)
Expand Down Expand Up @@ -393,10 +411,7 @@ def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element:
line_element = self.doc.createElement("span")
element.appendChild(line_element)
line_element.setAttribute("class", "ocrx_line")
line_element.setAttribute(
"title",
f"bbox {int(elem.x0)} {int(elem.y0)} {int(elem.x1)} {int(elem.y1)}",
)
line_element.setAttribute("title", bbox2str(elem.bbox))
words = self.get_word_boundaries(elem)
for word in words:
top, left, bottom, right = [int(x) for x in word[1:]]
Expand Down Expand Up @@ -460,10 +475,7 @@ def get_html_table(self, table: List[float], page_num) -> Optional[Element]:
line_element = self.doc.createElement("span")
cell_element.appendChild(line_element)
line_element.setAttribute("class", "ocrx_line")
line_element.setAttribute(
"title",
" ".join(["bbox"] + [str(int(_)) for _ in elem.bbox]),
)
line_element.setAttribute("title", bbox2str(elem.bbox))
words = self.get_word_boundaries(elem)
for word in words:
top = int(word[1])
Expand Down
6 changes: 3 additions & 3 deletions pdftotree/ml/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections import defaultdict
from typing import Any, List

from pdfminer.layout import LTTextLine
from pdfminer.layout import LTComponent, LTTextLine

from pdftotree.utils.bbox_utils import isContained
from pdftotree.utils.pdf.pdf_parsers import (
Expand Down Expand Up @@ -36,8 +36,8 @@ def get_height_coverage(bbox):


def get_mentions_within_bbox(
bbox: List[Any], mentions: List[LTTextLine]
) -> List[LTTextLine]:
bbox: List[Any], mentions: List[LTComponent]
) -> List[LTComponent]:
"""Get textlines within bbox.
:param bbox: a list containing (top, left, bottom, right) in the last 4 digits
Expand Down
12 changes: 12 additions & 0 deletions pdftotree/utils/bbox_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Tuple

TOLERANCE = 5


Expand Down Expand Up @@ -147,3 +149,13 @@ def compute_iou(bbox1, bbox2):
)
return float(intersection) / float(union)
return 0.0


def bbox2str(bbox: Tuple[float, float, float, float]) -> str:
    """Return a string representation suited for hOCR.

    Each coordinate is converted with ``int``, i.e. truncated toward zero
    (``1.9`` becomes ``1`` and ``-1.5`` becomes ``-1``), not rounded.

    :param bbox: a bounding box (left, top, right, bottom)
    :return: a string of the form ``bbox <left> <top> <right> <bottom>``
    :raises ValueError: if ``bbox`` does not unpack into exactly four values
    """
    # Unpack explicitly so a malformed box fails here instead of silently
    # producing a property with too few or too many coordinates.
    x0, y0, x1, y1 = bbox
    return f"bbox {int(x0)} {int(y0)} {int(x1)} {int(y1)}"

0 comments on commit 519f38c

Please sign in to comment.