From 572c0718914a92456f52cd84cee14d72babecebe Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Mon, 12 Oct 2020 16:10:04 -0700 Subject: [PATCH 01/10] Add type hints --- pdftotree/utils/pdf/node.py | 3 ++- pdftotree/utils/pdf/pdf_parsers.py | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pdftotree/utils/pdf/node.py b/pdftotree/utils/pdf/node.py index a56f12a..f8abcab 100644 --- a/pdftotree/utils/pdf/node.py +++ b/pdftotree/utils/pdf/node.py @@ -5,6 +5,7 @@ """ import numbers from collections import Counter +from typing import List, Union from pdfminer.layout import LTComponent, LTCurve, LTFigure, LTLine, LTTextLine @@ -31,7 +32,7 @@ class Node(LTComponent): Also holds its data and features. """ - def __init__(self, elems): + def __init__(self, elems: Union[List[LTComponent], LTComponent]): """ Constructor """ diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py index 37dfffc..800b560 100644 --- a/pdftotree/utils/pdf/pdf_parsers.py +++ b/pdftotree/utils/pdf/pdf_parsers.py @@ -12,7 +12,7 @@ from functools import cmp_to_key from typing import Any, Dict, List, Tuple -from pdfminer.layout import LTTextLine +from pdfminer.layout import LTFigure, LTTextLine from pdfminer.utils import Plane from pdftotree.utils.pdf.node import Node @@ -734,7 +734,7 @@ def parse_tree_structure( ) -> Tuple[Dict[str, Any], bool]: boxes_segments = elems.segments boxes_curves = elems.curves - boxes_figures = elems.figures + boxes_figures: List[LTFigure] = elems.figures page_width = elems.layout.width page_height = elems.layout.height mentions: List[LTTextLine] = elems.mentions @@ -1183,7 +1183,14 @@ def extract_text_candidates( return tree, new_ref_page_seen -def get_figures(boxes, page_bbox, page_num, boxes_figures, page_width, page_height): +def get_figures( + boxes: List[LTTextLine], + page_bbox: Tuple[float, float, float, float], + page_num: int, + boxes_figures: List[LTFigure], + page_width: float, + page_height: float, +) -> List[Tuple[int, int, int, float, float, float, float]]: # Filter out boxes with zero width or height filtered_boxes = [] for bbox in boxes: @@ -1198,11 +1205,8 @@ def get_figures(boxes, page_bbox, page_num, boxes_figures, page_width, page_heig plane = Plane(page_bbox) plane.extend(boxes) - nodes_figures = [] - - for fig_box in boxes_figures: - node_fig = Node(fig_box) - nodes_figures.append(node_fig) + # Convert LTFigure to Node + nodes_figures: List[Node] = [Node(fig_box) for fig_box in boxes_figures] merge_indices = [i for i in range(len(nodes_figures))] page_stat = Node(boxes) @@ -1221,7 +1225,9 @@ def get_figures(boxes, page_bbox, page_num, boxes_figures, page_width, page_heig return figures -def merge_nodes(nodes, plane, page_stat, merge_indices): +def merge_nodes( + nodes: List[Node], plane: Plane, page_stat: Node, merge_indices: List[int] +) -> Tuple[List[Node], List[int]]: """ Merges overlapping nodes """ From 75bc8d6fc9af33c8dcf73ddc3254aac549747faf Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Wed, 21 Oct 2020 13:37:06 -0700 Subject: [PATCH 02/10] Remove unused arguments from merge_nodes() --- pdftotree/utils/pdf/pdf_parsers.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py index 800b560..b2f04ba 100644 --- a/pdftotree/utils/pdf/pdf_parsers.py +++ b/pdftotree/utils/pdf/pdf_parsers.py @@ -547,8 +547,7 @@ def cluster_vertically_aligned_boxes( nodes = [Node(elems) for elems in clusters] node_indices = [i for i, x in enumerate(cid2obj2) if x] merge_indices = [i for i in range(len(node_indices))] - page_stat = Node(boxes) - nodes, merge_indices = merge_nodes(nodes, plane, page_stat, merge_indices) + nodes, merge_indices = merge_nodes(nodes, merge_indices) # Features for idx in range(len(merge_indices)): if merge_indices[idx] != idx: @@ -1033,8 +1032,7 @@ def extract_text_candidates( nodes = [Node(elems) for elems in clusters] node_indices = [i for i, x in enumerate(cid2obj) if x] merge_indices = [i for i in range(len(node_indices))] - page_stat = Node(boxes) - nodes, merge_indices = merge_nodes(nodes, plane, page_stat, merge_indices) + nodes, merge_indices = merge_nodes(nodes, merge_indices) # Merging Nodes new_nodes = [] @@ -1209,8 +1207,7 @@ def get_figures( nodes_figures: List[Node] = [Node(fig_box) for fig_box in boxes_figures] merge_indices = [i for i in range(len(nodes_figures))] - page_stat = Node(boxes) - nodes, merge_indices = merge_nodes(nodes_figures, plane, page_stat, merge_indices) + nodes, merge_indices = merge_nodes(nodes_figures, merge_indices) # Merging Nodes new_nodes = [] @@ -1226,7 +1223,7 @@ def get_figures( def merge_nodes( - nodes: List[Node], plane: Plane, page_stat: Node, merge_indices: List[int] + nodes: List[Node], merge_indices: List[int] ) -> Tuple[List[Node], List[int]]: """ Merges overlapping nodes From c6eaa187cf199c4a06bbb89e5f0d3c591a441a3c Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Wed, 21 Oct 2020 13:52:28 -0700 Subject: [PATCH 03/10] Change args of get_figures() --- pdftotree/utils/pdf/pdf_parsers.py | 36 +++++++++++------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py index b2f04ba..9789d7e 100644 --- a/pdftotree/utils/pdf/pdf_parsers.py +++ b/pdftotree/utils/pdf/pdf_parsers.py @@ -762,9 +762,13 @@ def parse_tree_structure( m.feats[prefix + "yc"] = m.yc_grid = m.yc // grid_size # Figures for this page - figures_page = get_figures( - mentions, elems.layout.bbox, page_num, boxes_figures, page_width, page_height - ) + nodes = get_figures(boxes_figures) + if len(nodes) == 0: + logger.warning("No boxes to get figures from on page {}.".format(page_num)) + figures_page: Tuple[int, int, int, float, float, float, float] = [ + (page_num, page_width, page_height) + (node.y0, node.x0, node.y1, node.x1) + for node in nodes + ] # Eliminate tables from these boxes boxes: List[LTTextLine] = [] @@ -1182,34 +1186,20 @@ def extract_text_candidates( def get_figures( - boxes: List[LTTextLine], - page_bbox: Tuple[float, float, float, float], - page_num: int, - boxes_figures: List[LTFigure], - page_width: float, - page_height: float, -) -> List[Tuple[int, int, int, float, float, float, float]]: + boxes: List[LTFigure], +) -> List[Node]: # Filter out boxes with zero width or height - filtered_boxes = [] - for bbox in boxes: - if bbox.x1 - bbox.x0 > 0 and bbox.y1 - bbox.y0 > 0: - filtered_boxes.append(bbox) - boxes = filtered_boxes + boxes = [bbox for bbox in boxes if not bbox.is_empty()] if len(boxes) == 0: - logger.warning("No boxes to get figures from on page {}.".format(page_num)) return [] - plane = Plane(page_bbox) - plane.extend(boxes) - # Convert LTFigure to Node - nodes_figures: List[Node] = [Node(fig_box) for fig_box in boxes_figures] + nodes_figures: List[Node] = [Node(fig_box) for fig_box in boxes] - merge_indices = [i for i in range(len(nodes_figures))] + # Merge and retain only the most outer nodes + merge_indices = list(range(len(nodes_figures))) nodes, merge_indices = merge_nodes(nodes_figures, merge_indices) - - # Merging Nodes new_nodes = [] for idx in range(len(merge_indices)): if merge_indices[idx] == idx: From c5be5199e6721c3bb3a0515a92772ca849ad38cb Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Wed, 21 Oct 2020 13:54:27 -0700 Subject: [PATCH 04/10] Remove unused return value from merge_nodes() --- pdftotree/utils/pdf/pdf_parsers.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py index 9789d7e..7fa9ae5 100644 --- a/pdftotree/utils/pdf/pdf_parsers.py +++ b/pdftotree/utils/pdf/pdf_parsers.py @@ -547,7 +547,7 @@ def cluster_vertically_aligned_boxes( nodes = [Node(elems) for elems in clusters] node_indices = [i for i, x in enumerate(cid2obj2) if x] merge_indices = [i for i in range(len(node_indices))] - nodes, merge_indices = merge_nodes(nodes, merge_indices) + merge_indices = merge_nodes(nodes, merge_indices) # Features for idx in range(len(merge_indices)): if merge_indices[idx] != idx: @@ -1036,7 +1036,7 @@ def extract_text_candidates( nodes = [Node(elems) for elems in clusters] node_indices = [i for i, x in enumerate(cid2obj) if x] merge_indices = [i for i in range(len(node_indices))] - nodes, merge_indices = merge_nodes(nodes, merge_indices) + merge_indices = merge_nodes(nodes, merge_indices) # Merging Nodes new_nodes = [] @@ -1195,11 +1195,11 @@ def get_figures( return [] # Convert LTFigure to Node - nodes_figures: List[Node] = [Node(fig_box) for fig_box in boxes] + nodes: List[Node] = [Node(fig_box) for fig_box in boxes] # Merge and retain only the most outer nodes - merge_indices = list(range(len(nodes_figures))) - nodes, merge_indices = merge_nodes(nodes_figures, merge_indices) + merge_indices = list(range(len(nodes))) + merge_indices = merge_nodes(nodes, merge_indices) new_nodes = [] for idx in range(len(merge_indices)): if merge_indices[idx] == idx: @@ -1212,9 +1212,7 @@ def get_figures( return figures -def merge_nodes( - nodes: List[Node], merge_indices: List[int] -) -> Tuple[List[Node], List[int]]: +def merge_nodes(nodes: List[Node], merge_indices: List[int]) -> List[int]: """ Merges overlapping nodes """ @@ -1244,7 +1242,7 @@ def merge_nodes( for cid_iter in range(len(merge_indices)): if merge_indices[cid_iter] == merge_indices[inner_idx]: merge_indices[cid_iter] = merge_indices[best_outer_idx] - return nodes, merge_indices + return merge_indices def get_most_common_font_pts(mentions, font_stat): From 1bed2d5bafebb9e9aafd7fcef17f0c0398614543 Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Wed, 21 Oct 2020 22:19:59 -0700 Subject: [PATCH 05/10] Add type hints and docstrings --- pdftotree/TreeExtract.py | 8 +++++--- pdftotree/utils/pdf/pdf_parsers.py | 7 +++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pdftotree/TreeExtract.py b/pdftotree/TreeExtract.py index 4eba475..7fc6487 100644 --- a/pdftotree/TreeExtract.py +++ b/pdftotree/TreeExtract.py @@ -42,7 +42,9 @@ def __init__(self, pdf_file): self.font_stats: Dict[int, Any] = {} # key represents page_num self.iou_thresh = 0.8 self.scanned = False - self.tree: Dict[int, Any] = {} # key represents page_num + self.tree: Dict[ + int, Dict[str, Tuple[int, int, int, float, float, float, float]] + ] = {} # key represents page_num def identify_scanned_page(self, boxes, page_bbox, page_width, page_height): plane = Plane(page_bbox) @@ -292,13 +294,13 @@ def get_html_tree(self) -> str: body = doc.createElement("body") html.appendChild(body) for page_num in self.elems.keys(): # 1-based - boxes = [] + boxes: List[Tuple[str, float, float, float, float]] = [] for clust in self.tree[page_num]: for (pnum, pwidth, pheight, top, left, bottom, right) in self.tree[ page_num ][clust]: boxes += [ - [clust.lower().replace(" ", "_"), top, left, bottom, right] + (clust.lower().replace(" ", "_"), top, left, bottom, right) ] page = doc.createElement("div") page.setAttribute("class", "ocr_page") diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py index 7fa9ae5..5b57362 100644 --- a/pdftotree/utils/pdf/pdf_parsers.py +++ b/pdftotree/utils/pdf/pdf_parsers.py @@ -1213,8 +1213,11 @@ def get_figures( def merge_nodes(nodes: List[Node], merge_indices: List[int]) -> List[int]: - """ - Merges overlapping nodes + """Merges overlapping nodes. + + :param nodes: Nodes to be merged + :param merge_indices: Indices of nodes + :return: a list of indices, indicating which node is its most outer node. """ # Merge inner boxes to the best outer box # nodes.sort(key=Node.area) From ee1f41a8f0f10a15b87a1b0c051e50044d9a3cf8 Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Wed, 21 Oct 2020 22:20:21 -0700 Subject: [PATCH 06/10] Refactor --- pdftotree/utils/pdf/pdf_parsers.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py index 5b57362..ea2549b 100644 --- a/pdftotree/utils/pdf/pdf_parsers.py +++ b/pdftotree/utils/pdf/pdf_parsers.py @@ -1200,16 +1200,9 @@ def get_figures( # Merge and retain only the most outer nodes merge_indices = list(range(len(nodes))) merge_indices = merge_nodes(nodes, merge_indices) - new_nodes = [] - for idx in range(len(merge_indices)): - if merge_indices[idx] == idx: - new_nodes.append(nodes[idx]) + new_nodes = [node for idx, node in enumerate(nodes) if merge_indices[idx] == idx] - figures = [ - (page_num, page_width, page_height) + (node.y0, node.x0, node.y1, node.x1) - for node in new_nodes - ] - return figures + return new_nodes def merge_nodes(nodes: List[Node], merge_indices: List[int]) -> List[int]: From 37eead85dd7f0b288180ce0c79b27fd5a165e237 Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Wed, 21 Oct 2020 16:28:06 -0700 Subject: [PATCH 07/10] Embed img if it is jpeg --- pdftotree/TreeExtract.py | 32 ++++++++++++++++++++++---------- pdftotree/ml/features.py | 6 +++--- pdftotree/utils/bbox_utils.py | 12 ++++++++++++ 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/pdftotree/TreeExtract.py b/pdftotree/TreeExtract.py index 7fc6487..c729b8c 100644 --- a/pdftotree/TreeExtract.py +++ b/pdftotree/TreeExtract.py @@ -1,12 +1,13 @@ import logging import os +from base64 import b64encode from functools import cmp_to_key from typing import Any, Dict, List, Optional, Tuple from xml.dom.minidom import Document, Element import numpy as np import tabula -from pdfminer.layout import LAParams, LTChar, LTPage, LTTextLine +from pdfminer.layout import LAParams, LTChar, LTImage, LTPage, LTTextLine from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage @@ -15,7 +16,7 @@ from pdftotree._version import __version__ from pdftotree.ml.features import get_lines_features, get_mentions_within_bbox -from pdftotree.utils.bbox_utils import get_rectangles +from pdftotree.utils.bbox_utils import bbox2str, get_rectangles from pdftotree.utils.lines_utils import ( extend_horizontal_lines, extend_vertical_lines, @@ -321,12 +322,29 @@ def get_html_tree(self) -> str: table_element = self.get_html_table(table, page_num) page.appendChild(table_element) elif box[0] == "figure": + elems: List[LTTextLine] = get_mentions_within_bbox( + box, self.elems[page_num].figures + ) fig_element = doc.createElement("figure") page.appendChild(fig_element) top, left, bottom, right = [int(i) for i in box[1:]] fig_element.setAttribute( "title", f"bbox {left} {top} {right} {bottom}" ) + for img in [img for elem in elems for img in elem]: + if not isinstance(img, LTImage): + continue + data = img.stream.get_rawdata() + base64 = b64encode(data).decode("ascii") + if data.startswith(b"\xff\xd8\xff"): + img_element = doc.createElement("img") + fig_element.appendChild(img_element) + img_element.setAttribute("title", bbox2str(img.bbox)) + img_element.setAttribute( + "src", f"data:image/jpeg;base64,{base64}" + ) + else: + logger.warning(f"Skipping an image of unknown type: {img}.") else: element = self.get_html_others(box[0], box[1:], page_num) page.appendChild(element) @@ -392,10 +410,7 @@ def get_html_others(self, tag: str, box: List[float], page_num: int) -> Element: line_element = self.doc.createElement("span") element.appendChild(line_element) line_element.setAttribute("class", "ocrx_line") - line_element.setAttribute( - "title", - f"bbox {int(elem.x0)} {int(elem.y0)} {int(elem.x1)} {int(elem.y1)}", - ) + line_element.setAttribute("title", bbox2str(elem.bbox)) words = self.get_word_boundaries(elem) for word in words: top, left, bottom, right = [int(x) for x in word[1:]] @@ -458,10 +473,7 @@ def get_html_table(self, table: List[float], page_num) -> Optional[Element]: line_element = self.doc.createElement("span") cell_element.appendChild(line_element) line_element.setAttribute("class", "ocrx_line") - line_element.setAttribute( - "title", - " ".join(["bbox"] + [str(int(_)) for _ in elem.bbox]), - ) + line_element.setAttribute("title", bbox2str(elem.bbox)) words = self.get_word_boundaries(elem) for word in words: top = int(word[1]) diff --git a/pdftotree/ml/features.py b/pdftotree/ml/features.py index 0d65a4e..8f62e2c 100644 --- a/pdftotree/ml/features.py +++ b/pdftotree/ml/features.py @@ -3,7 +3,7 @@ from collections import defaultdict from typing import Any, List -from pdfminer.layout import LTTextLine +from pdfminer.layout import LTComponent, LTTextLine from pdftotree.utils.bbox_utils import isContained from pdftotree.utils.pdf.pdf_parsers import ( @@ -36,8 +36,8 @@ def get_height_coverage(bbox): def get_mentions_within_bbox( - bbox: List[Any], mentions: List[LTTextLine] -) -> List[LTTextLine]: + bbox: List[Any], mentions: List[LTComponent] +) -> List[LTComponent]: """Get textlines within bbox. :param bbox: a list containing (top, left, bottom, right) in the last 4 digits diff --git a/pdftotree/utils/bbox_utils.py b/pdftotree/utils/bbox_utils.py index 55b676e..c52a7b6 100644 --- a/pdftotree/utils/bbox_utils.py +++ b/pdftotree/utils/bbox_utils.py @@ -1,3 +1,5 @@ +from typing import Tuple + TOLERANCE = 5 @@ -147,3 +149,13 @@ def compute_iou(bbox1, bbox2): ) return float(intersection) / float(union) return 0.0 + + +def bbox2str(bbox: Tuple[float, float, float, float]) -> str: + """Return a string representation suited for hOCR. + + :param bbox: a bounding box (left, top, right, bottom) + :return: a string representation for hOCR + """ + (x0, y0, x1, y1) = bbox + return f"bbox {int(x0)} {int(y0)} {int(x1)} {int(y1)}" From 2c9158360de691022b84a6bfb13bd92f2718f370 Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Thu, 22 Oct 2020 12:46:38 -0700 Subject: [PATCH 08/10] Add a test --- tests/test_figures.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/test_figures.py diff --git a/tests/test_figures.py b/tests/test_figures.py new file mode 100644 index 0000000..f1baf6b --- /dev/null +++ b/tests/test_figures.py @@ -0,0 +1,11 @@ +"""Test figures.""" +from bs4 import BeautifulSoup + +import pdftotree + + +def test_figures(): + output = pdftotree.parse("tests/input/md.pdf") + soup = BeautifulSoup(output, "lxml") + imgs = soup.find_all("img") + assert len(imgs) == 1 From 7356e89233244362fc87e5fd9ef052560b0f296d Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Thu, 22 Oct 2020 14:17:27 -0700 Subject: [PATCH 09/10] Support BMP images --- pdftotree/TreeExtract.py | 32 ++++++++++++++++++++++---------- tests/test_figures.py | 8 ++++++++ 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/pdftotree/TreeExtract.py b/pdftotree/TreeExtract.py index c729b8c..aad8b9b 100644 --- a/pdftotree/TreeExtract.py +++ b/pdftotree/TreeExtract.py @@ -1,5 +1,6 @@ import logging import os +import tempfile from base64 import b64encode from functools import cmp_to_key from typing import Any, Dict, List, Optional, Tuple @@ -7,6 +8,7 @@ import numpy as np import tabula +from pdfminer.image import ImageWriter from pdfminer.layout import LAParams, LTChar, LTImage, LTPage, LTTextLine from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager @@ -270,6 +272,10 @@ def get_tree_structure(self, model_type, model) -> Dict[str, Any]: return self.tree def get_html_tree(self) -> str: + # Create a temp folder where images are temporarily saved. + dirname = tempfile.mkdtemp() + imagewriter = ImageWriter(dirname) + doc = Document() self.doc = doc html = doc.createElement("html") @@ -334,17 +340,23 @@ def get_html_tree(self) -> str: for img in [img for elem in elems for img in elem]: if not isinstance(img, LTImage): continue - data = img.stream.get_rawdata() - base64 = b64encode(data).decode("ascii") - if data.startswith(b"\xff\xd8\xff"): - img_element = doc.createElement("img") - fig_element.appendChild(img_element) - img_element.setAttribute("title", bbox2str(img.bbox)) - img_element.setAttribute( - "src", f"data:image/jpeg;base64,{base64}" - ) + filename = imagewriter.export_image(img) + with open(os.path.join(dirname, filename), "rb") as f: + base64 = b64encode(f.read()).decode("ascii") + if filename.endswith("jpg"): + mediatype = "jpeg" + elif filename.endswith("bmp"): + mediatype = "bmp" else: - logger.warning(f"Skipping an image of unknown type: {img}.") + logger.info(f"Skipping an unknown type image: {filename}.") + continue + logger.info(f"Embedding a known type image: {filename}.") + img_element = doc.createElement("img") + fig_element.appendChild(img_element) + img_element.setAttribute("title", bbox2str(img.bbox)) + img_element.setAttribute( + "src", f"data:image/{mediatype};base64,{base64}" + ) else: element = self.get_html_others(box[0], box[1:], page_num) page.appendChild(element) diff --git a/tests/test_figures.py b/tests/test_figures.py index f1baf6b..d3bf77a 100644 --- a/tests/test_figures.py +++ b/tests/test_figures.py @@ -9,3 +9,11 @@ def test_figures(): soup = BeautifulSoup(output, "lxml") imgs = soup.find_all("img") assert len(imgs) == 1 + + output = pdftotree.parse("tests/input/CaseStudy_ACS.pdf") + soup = BeautifulSoup(output, "lxml") + imgs = soup.find_all("img") + # 3 jpg, 2 bmp, 5 total images + assert len(imgs) == 5 + assert len([img for img in imgs if img["src"].startswith("data:image/jpeg")]) == 3 + assert len([img for img in imgs if img["src"].startswith("data:image/bmp")]) == 2 From 1cfa226f1b0481b5ff62b72c51ad0f8ed184e724 Mon Sep 17 00:00:00 2001 From: Hiromu Hota Date: Thu, 22 Oct 2020 15:01:17 -0700 Subject: [PATCH 10/10] Update CHANGELOG --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94731c1..ca8b476 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Embed base64-encoded images inline. Support starting with JPEG and BMP. + ([#99](https://github.com/HazyResearch/pdftotree/pull/99), [@HiromuHota][HiromuHota]) + ### Fixed - List a missing "ocrx_line" in the ocr-capabilities metadata field. ([#94](https://github.com/HazyResearch/pdftotree/issues/94), [@HiromuHota][HiromuHota])