Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Embed Base64-Encoded Images Inline #99

Merged
merged 10 commits into from
Oct 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added
- Embed base64-encoded images inline. Initial support covers JPEG and BMP.
([#99](https://github.com/HazyResearch/pdftotree/pull/99), [@HiromuHota][HiromuHota])

### Fixed
- List a missing "ocrx_line" in the ocr-capabilities metadata field.
([#94](https://github.com/HazyResearch/pdftotree/issues/94), [@HiromuHota][HiromuHota])
Expand Down
52 changes: 39 additions & 13 deletions pdftotree/TreeExtract.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import logging
import os
import tempfile
from base64 import b64encode
from functools import cmp_to_key
from typing import Any, Dict, List, Optional, Tuple
from xml.dom.minidom import Document, Element

import numpy as np
import tabula
from pdfminer.layout import LAParams, LTChar, LTPage, LTTextLine
from pdfminer.image import ImageWriter
from pdfminer.layout import LAParams, LTChar, LTImage, LTPage, LTTextLine
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
Expand All @@ -15,7 +18,7 @@

from pdftotree._version import __version__
from pdftotree.ml.features import get_lines_features, get_mentions_within_bbox
from pdftotree.utils.bbox_utils import get_rectangles
from pdftotree.utils.bbox_utils import bbox2str, get_rectangles
from pdftotree.utils.lines_utils import (
extend_horizontal_lines,
extend_vertical_lines,
Expand All @@ -42,7 +45,9 @@ def __init__(self, pdf_file):
self.font_stats: Dict[int, Any] = {} # key represents page_num
self.iou_thresh = 0.8
self.scanned = False
self.tree: Dict[int, Any] = {} # key represents page_num
self.tree: Dict[
int, Dict[str, Tuple[int, int, int, float, float, float, float]]
] = {} # key represents page_num

def identify_scanned_page(self, boxes, page_bbox, page_width, page_height):
plane = Plane(page_bbox)
Expand Down Expand Up @@ -267,6 +272,10 @@ def get_tree_structure(self, model_type, model) -> Dict[str, Any]:
return self.tree

def get_html_tree(self) -> str:
# Create a temp folder where images are temporarily saved.
dirname = tempfile.mkdtemp()
imagewriter = ImageWriter(dirname)

doc = Document()
self.doc = doc
html = doc.createElement("html")
Expand All @@ -292,13 +301,13 @@ def get_html_tree(self) -> str:
body = doc.createElement("body")
html.appendChild(body)
for page_num in self.elems.keys(): # 1-based
boxes = []
boxes: List[Tuple[str, float, float, float, float]] = []
for clust in self.tree[page_num]:
for (pnum, pwidth, pheight, top, left, bottom, right) in self.tree[
page_num
][clust]:
boxes += [
[clust.lower().replace(" ", "_"), top, left, bottom, right]
(clust.lower().replace(" ", "_"), top, left, bottom, right)
]
page = doc.createElement("div")
page.setAttribute("class", "ocr_page")
Expand All @@ -319,12 +328,35 @@ def get_html_tree(self) -> str:
table_element = self.get_html_table(table, page_num)
page.appendChild(table_element)
elif box[0] == "figure":
elems: List[LTTextLine] = get_mentions_within_bbox(
box, self.elems[page_num].figures
)
fig_element = doc.createElement("figure")
page.appendChild(fig_element)
top, left, bottom, right = [int(i) for i in box[1:]]
fig_element.setAttribute(
"title", f"bbox {left} {top} {right} {bottom}"
)
for img in [img for elem in elems for img in elem]:
if not isinstance(img, LTImage):
continue
filename = imagewriter.export_image(img)
with open(os.path.join(dirname, filename), "rb") as f:
base64 = b64encode(f.read()).decode("ascii")
if filename.endswith("jpg"):
mediatype = "jpeg"
elif filename.endswith("bmp"):
mediatype = "bmp"
else:
logger.info(f"Skipping an unknown type image: {filename}.")
continue
logger.info(f"Embedding a known type image: {filename}.")
img_element = doc.createElement("img")
fig_element.appendChild(img_element)
img_element.setAttribute("title", bbox2str(img.bbox))
img_element.setAttribute(
"src", f"data:image/{mediatype};base64,{base64}"
)
else:
element = self.get_html_others(box[0], box[1:], page_num)
page.appendChild(element)
Expand Down Expand Up @@ -390,10 +422,7 @@ def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element:
line_element = self.doc.createElement("span")
element.appendChild(line_element)
line_element.setAttribute("class", "ocrx_line")
line_element.setAttribute(
"title",
f"bbox {int(elem.x0)} {int(elem.y0)} {int(elem.x1)} {int(elem.y1)}",
)
line_element.setAttribute("title", bbox2str(elem.bbox))
words = self.get_word_boundaries(elem)
for word in words:
top, left, bottom, right = [int(x) for x in word[1:]]
Expand Down Expand Up @@ -456,10 +485,7 @@ def get_html_table(self, table: List[float], page_num) -> Optional[Element]:
line_element = self.doc.createElement("span")
cell_element.appendChild(line_element)
line_element.setAttribute("class", "ocrx_line")
line_element.setAttribute(
"title",
" ".join(["bbox"] + [str(int(_)) for _ in elem.bbox]),
)
line_element.setAttribute("title", bbox2str(elem.bbox))
words = self.get_word_boundaries(elem)
for word in words:
top = int(word[1])
Expand Down
6 changes: 3 additions & 3 deletions pdftotree/ml/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections import defaultdict
from typing import Any, List

from pdfminer.layout import LTTextLine
from pdfminer.layout import LTComponent, LTTextLine

from pdftotree.utils.bbox_utils import isContained
from pdftotree.utils.pdf.pdf_parsers import (
Expand Down Expand Up @@ -36,8 +36,8 @@ def get_height_coverage(bbox):


def get_mentions_within_bbox(
bbox: List[Any], mentions: List[LTTextLine]
) -> List[LTTextLine]:
bbox: List[Any], mentions: List[LTComponent]
) -> List[LTComponent]:
"""Get textlines within bbox.

:param bbox: a list containing (top, left, bottom, right) in the last 4 digits
Expand Down
12 changes: 12 additions & 0 deletions pdftotree/utils/bbox_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Tuple

TOLERANCE = 5


Expand Down Expand Up @@ -147,3 +149,13 @@ def compute_iou(bbox1, bbox2):
)
return float(intersection) / float(union)
return 0.0


def bbox2str(bbox: Tuple[float, float, float, float]) -> str:
    """Return a string representation suited for hOCR.

    Each coordinate is converted with ``int()`` (i.e. truncated toward zero)
    and the four values are emitted in the order given, prefixed by the
    literal ``bbox`` and separated by single spaces.

    :param bbox: a bounding box (left, top, right, bottom)
    :return: a string representation for hOCR, e.g. ``"bbox 1 2 3 4"``
    :raises ValueError: if ``bbox`` does not unpack into exactly four values
    """
    x0, y0, x1, y1 = bbox
    return f"bbox {int(x0)} {int(y0)} {int(x1)} {int(y1)}"
3 changes: 2 additions & 1 deletion pdftotree/utils/pdf/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
import numbers
from collections import Counter
from typing import List, Union

from pdfminer.layout import LTComponent, LTCurve, LTFigure, LTLine, LTTextLine

Expand All @@ -31,7 +32,7 @@ class Node(LTComponent):
Also holds its data and features.
"""

def __init__(self, elems):
def __init__(self, elems: Union[List[LTComponent], LTComponent]):
"""
Constructor
"""
Expand Down
69 changes: 28 additions & 41 deletions pdftotree/utils/pdf/pdf_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from functools import cmp_to_key
from typing import Any, Dict, List, Tuple

from pdfminer.layout import LTTextLine
from pdfminer.layout import LTFigure, LTTextLine
from pdfminer.utils import Plane

from pdftotree.utils.pdf.node import Node
Expand Down Expand Up @@ -547,8 +547,7 @@ def cluster_vertically_aligned_boxes(
nodes = [Node(elems) for elems in clusters]
node_indices = [i for i, x in enumerate(cid2obj2) if x]
merge_indices = [i for i in range(len(node_indices))]
page_stat = Node(boxes)
nodes, merge_indices = merge_nodes(nodes, plane, page_stat, merge_indices)
merge_indices = merge_nodes(nodes, merge_indices)
# Features
for idx in range(len(merge_indices)):
if merge_indices[idx] != idx:
Expand Down Expand Up @@ -734,7 +733,7 @@ def parse_tree_structure(
) -> Tuple[Dict[str, Any], bool]:
boxes_segments = elems.segments
boxes_curves = elems.curves
boxes_figures = elems.figures
boxes_figures: List[LTFigure] = elems.figures
page_width = elems.layout.width
page_height = elems.layout.height
mentions: List[LTTextLine] = elems.mentions
Expand Down Expand Up @@ -763,9 +762,13 @@ def parse_tree_structure(
m.feats[prefix + "yc"] = m.yc_grid = m.yc // grid_size

# Figures for this page
figures_page = get_figures(
mentions, elems.layout.bbox, page_num, boxes_figures, page_width, page_height
)
nodes = get_figures(boxes_figures)
if len(nodes) == 0:
logger.warning("No boxes to get figures from on page {}.".format(page_num))
figures_page: Tuple[int, int, int, float, float, float, float] = [
(page_num, page_width, page_height) + (node.y0, node.x0, node.y1, node.x1)
for node in nodes
]

# Eliminate tables from these boxes
boxes: List[LTTextLine] = []
Expand Down Expand Up @@ -1033,8 +1036,7 @@ def extract_text_candidates(
nodes = [Node(elems) for elems in clusters]
node_indices = [i for i, x in enumerate(cid2obj) if x]
merge_indices = [i for i in range(len(node_indices))]
page_stat = Node(boxes)
nodes, merge_indices = merge_nodes(nodes, plane, page_stat, merge_indices)
merge_indices = merge_nodes(nodes, merge_indices)

# Merging Nodes
new_nodes = []
Expand Down Expand Up @@ -1183,47 +1185,32 @@ def extract_text_candidates(
return tree, new_ref_page_seen


def get_figures(boxes, page_bbox, page_num, boxes_figures, page_width, page_height):
def get_figures(
boxes: List[LTFigure],
) -> List[Node]:
# Filter out boxes with zero width or height
filtered_boxes = []
for bbox in boxes:
if bbox.x1 - bbox.x0 > 0 and bbox.y1 - bbox.y0 > 0:
filtered_boxes.append(bbox)
boxes = filtered_boxes
boxes = [bbox for bbox in boxes if not bbox.is_empty()]

if len(boxes) == 0:
logger.warning("No boxes to get figures from on page {}.".format(page_num))
return []

plane = Plane(page_bbox)
plane.extend(boxes)
# Convert LTFigure to Node
nodes: List[Node] = [Node(fig_box) for fig_box in boxes]

nodes_figures = []
# Merge and retain only the most outer nodes
merge_indices = list(range(len(nodes)))
merge_indices = merge_nodes(nodes, merge_indices)
new_nodes = [node for idx, node in enumerate(nodes) if merge_indices[idx] == idx]

for fig_box in boxes_figures:
node_fig = Node(fig_box)
nodes_figures.append(node_fig)

merge_indices = [i for i in range(len(nodes_figures))]
page_stat = Node(boxes)
nodes, merge_indices = merge_nodes(nodes_figures, plane, page_stat, merge_indices)

# Merging Nodes
new_nodes = []
for idx in range(len(merge_indices)):
if merge_indices[idx] == idx:
new_nodes.append(nodes[idx])
return new_nodes

figures = [
(page_num, page_width, page_height) + (node.y0, node.x0, node.y1, node.x1)
for node in new_nodes
]
return figures

def merge_nodes(nodes: List[Node], merge_indices: List[int]) -> List[int]:
"""Merges overlapping nodes.

def merge_nodes(nodes, plane, page_stat, merge_indices):
"""
Merges overlapping nodes
:param nodes: Nodes to be merged
:param merge_indices: Indices of nodes
:return: a list of indices, indicating which node is its most outer node.
"""
# Merge inner boxes to the best outer box
# nodes.sort(key=Node.area)
Expand Down Expand Up @@ -1251,7 +1238,7 @@ def merge_nodes(nodes, plane, page_stat, merge_indices):
for cid_iter in range(len(merge_indices)):
if merge_indices[cid_iter] == merge_indices[inner_idx]:
merge_indices[cid_iter] = merge_indices[best_outer_idx]
return nodes, merge_indices
return merge_indices


def get_most_common_font_pts(mentions, font_stat):
Expand Down
19 changes: 19 additions & 0 deletions tests/test_figures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Test figures."""
from bs4 import BeautifulSoup

import pdftotree


def test_figures():
    """Check that images found in figures are embedded as inline ``<img>`` tags."""
    # A PDF with a single embedded image.
    output = pdftotree.parse("tests/input/md.pdf")
    soup = BeautifulSoup(output, "lxml")
    imgs = soup.find_all("img")
    assert len(imgs) == 1

    # A PDF with images of mixed types.
    output = pdftotree.parse("tests/input/CaseStudy_ACS.pdf")
    soup = BeautifulSoup(output, "lxml")
    imgs = soup.find_all("img")
    # 3 jpg, 2 bmp, 5 total images
    assert len(imgs) == 5
    # The media type is read from the prefix of each data URI in ``src``.
    assert len([img for img in imgs if img["src"].startswith("data:image/jpeg")]) == 3
    assert len([img for img in imgs if img["src"].startswith("data:image/bmp")]) == 2