-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #37 from huridocs/toc
Toc
- Loading branch information
Showing
16 changed files
with
539 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from toc.TitleFeatures import TitleFeatures | ||
from toc.PdfSegmentation import PdfSegmentation | ||
|
||
|
||
class MergeTwoSegmentsTitles:
    """Merge consecutive title segments that appear to be one title split in two.

    On construction the title features are obtained from
    ``TitleFeatures.from_pdf_segmentation(pdf_segmentation)`` and merged
    immediately; the result is exposed as ``titles_merged``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        self.title_features_list: list[TitleFeatures] = TitleFeatures.from_pdf_segmentation(pdf_segmentation)
        self.titles_merged: list[TitleFeatures] = list()
        self.merge()

    def merge(self):
        """Rebuild ``titles_merged`` from ``title_features_list``.

        Titles are visited in list order. While ``should_merge`` accepts the
        running title and the next one, the two are combined with
        ``next_title.append(running_title)`` and the combination becomes the
        running title, so chains of more than two segments collapse into one.

        ``title_features_list`` is left untouched and ``titles_merged`` is
        rebuilt from scratch, so calling this method again yields the same
        result instead of duplicating or re-merging titles.
        """
        merged: list[TitleFeatures] = []
        titles = self.title_features_list

        if not titles:
            self.titles_merged = merged
            return

        pending = titles[0]
        for next_title in titles[1:]:
            if self.should_merge(pending, next_title):
                # Same call shape as before: the later title absorbs the earlier one.
                pending = next_title.append(pending)
            else:
                merged.append(pending)
                pending = next_title

        # The last running title has no successor to be merged into.
        merged.append(pending)
        self.titles_merged = merged

    @staticmethod
    def should_merge(
        title: TitleFeatures,
        other_title: TitleFeatures,
        *,
        max_vertical_gap: float = 15,
        max_horizontal_gap: float = 15,
    ) -> bool:
        """Return True when ``other_title`` should be merged with ``title``.

        All of the following must hold:

        * both titles are on the same page;
        * ``abs(other_title.top - title.bottom)`` is at most ``max_vertical_gap``;
        * ``abs(other_title.left - title.right)`` and
          ``abs(other_title.right - title.left)`` are both at most
          ``max_horizontal_gap``;
        * the titles do not both have ``first_characters_type`` in ``1, 2, 3``;
        * the titles do not both have a truthy ``bullet_points_type``;
        * ``get_features_to_merge()`` returns equal values for both.

        Args:
            title: The earlier title.
            other_title: The title that follows ``title``.
            max_vertical_gap: Largest accepted vertical distance, in the
                coordinate units used by ``TitleFeatures``.
            max_horizontal_gap: Largest accepted horizontal distance, in the
                same units.
        """
        if other_title.pdf_segment.page_number != title.pdf_segment.page_number:
            return False

        if abs(other_title.top - title.bottom) > max_vertical_gap:
            return False

        # NOTE(review): requiring BOTH distances to be small only passes when the
        # two boxes are narrow and close together - confirm this is intended.
        if (
            abs(other_title.left - title.right) > max_horizontal_gap
            or abs(other_title.right - title.left) > max_horizontal_gap
        ):
            return False

        # Presumably types 1-3 mark numbered/enumerated headings, each of which
        # starts its own title - TODO confirm against TitleFeatures.
        if title.first_characters_type in [1, 2, 3] and other_title.first_characters_type in [1, 2, 3]:
            return False

        if title.bullet_points_type and other_title.bullet_points_type:
            return False

        if title.get_features_to_merge() != other_title.get_features_to_merge():
            return False

        return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from fast_trainer.PdfSegment import PdfSegment | ||
from pdf_features.PdfFeatures import PdfFeatures | ||
from pdf_features.PdfToken import PdfToken | ||
|
||
|
||
class PdfSegmentation:
    """Pair the tokens of a PDF with the segment each one overlaps most.

    Attributes are set in ``__init__``:
    ``pdf_features`` and ``pdf_segments`` are stored as given, and
    ``tokens_by_segments`` maps each segment that received at least one token
    to the list of its tokens.
    """

    def __init__(self, pdf_features: PdfFeatures, pdf_segments: list[PdfSegment]):
        self.pdf_features: PdfFeatures = pdf_features
        self.pdf_segments: list[PdfSegment] = pdf_segments
        self.tokens_by_segments: dict[PdfSegment, list[PdfToken]] = self.find_tokens_by_segments()

    @staticmethod
    def find_segment_for_token(token: PdfToken, segments: list[PdfSegment], tokens_by_segments) -> None:
        """Record ``token`` under the segment it intersects the most.

        Each candidate is scored with
        ``token.bounding_box.get_intersection_percentage(segment.bounding_box)``.
        The first segment with the strictly highest score wins; a token whose
        score is never above 0 is not recorded at all.

        Args:
            token: The token to place.
            segments: Candidate segments (the caller passes those of the
                token's page).
            tokens_by_segments: Mapping updated in place; the token is appended
                to the list stored under the winning segment.
        """
        best_score: float = 0
        most_probable_segment: PdfSegment | None = None
        for segment in segments:
            intersection_percentage = token.bounding_box.get_intersection_percentage(segment.bounding_box)
            if intersection_percentage > best_score:
                best_score = intersection_percentage
                most_probable_segment = segment
                # A score of 99 or more is treated as good enough to stop searching.
                if best_score >= 99:
                    break

        # Compare against None explicitly so a segment object that happens to be
        # falsy is not mistaken for "no segment found".
        if most_probable_segment is not None:
            tokens_by_segments.setdefault(most_probable_segment, []).append(token)

    def find_tokens_by_segments(self) -> dict[PdfSegment, list[PdfToken]]:
        """Build the segment-to-tokens mapping for every page of ``pdf_features``.

        Segments are grouped by ``page_number`` once up front, so each token is
        only compared with the segments of its own page without re-filtering
        the whole segment list for every page.
        """
        segments_by_page: dict = {}
        for segment in self.pdf_segments:
            segments_by_page.setdefault(segment.page_number, []).append(segment)

        tokens_by_segments: dict[PdfSegment, list[PdfToken]] = {}
        for page in self.pdf_features.pages:
            page_segments = segments_by_page.get(page.page_number, [])
            for token in page.tokens:
                self.find_segment_for_token(token, page_segments, tokens_by_segments)
        return tokens_by_segments
Oops, something went wrong.