diff --git a/rag/app/naive.py b/rag/app/naive.py index 54d10e46351..bd507ffe10e 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -16,14 +16,15 @@ from timeit import default_timer as timer import re from deepdoc.parser.pdf_parser import PlainParser -from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx +from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \ + naive_merge_docx, tokenize_chunks_docx from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser from rag.settings import cron_logger from rag.utils import num_tokens_from_string from PIL import Image from functools import reduce from markdown import markdown -from docx.image.exceptions import UnrecognizedImageError +from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError class Docx(DocxParser): @@ -42,6 +43,12 @@ def get_picture(self, document, paragraph): except UnrecognizedImageError: print("Unrecognized image format. Skipping image.") return None + except UnexpectedEndOfFileError: + print("EOF was unexpectedly encountered while reading an image stream. Skipping image.") + return None + except InvalidImageStreamError: + print("The recognized image stream appears to be corrupted. Skipping image.") + return None try: image = Image.open(BytesIO(image_blob)).convert('RGB') return image @@ -101,7 +108,7 @@ def __call__(self, filename, binary=None, from_page=0, to_page=100000): while i < len(r.cells): span = 1 c = r.cells[i] - for j in range(i+1, len(r.cells)): + for j in range(i + 1, len(r.cells)): if c.text == r.cells[j].text: span += 1 i = j @@ -136,9 +143,9 @@ def __call__(self, filename, binary=None, from_page=0, self._text_merge() callback(0.67, "Text merging finished") tbls = self._extract_table_figure(True, zoomin, True, True) - #self._naive_vertical_merge() + # self._naive_vertical_merge() self._concat_downward() - #self._filter_forpages() + # self._filter_forpages() cron_logger.info("layouts: {}".format(timer() - start)) return [(b["text"], self._line_tag(b, zoomin)) @@ -158,8 +165,8 @@ def __call__(self, filename, binary=None): tbls = [] for sec in remainder.split("\n"): if num_tokens_from_string(sec) > 10 * self.chunk_token_num: - sections.append((sec[:int(len(sec)/2)], "")) - sections.append((sec[int(len(sec)/2):], "")) + sections.append((sec[:int(len(sec) / 2)], "")) + sections.append((sec[int(len(sec) / 2):], "")) else: sections.append((sec, "")) print(tables) @@ -191,7 +198,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = Docx()(filename, binary) - res = tokenize_table(tbls, doc, eng) # just for table + res = tokenize_table(tbls, doc, eng) # just for table callback(0.8, "Finish parsing.") st = timer() @@ -229,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?")) callback(0.8, "Finish parsing.") - + elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary) @@ -276,7 +283,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if __name__ == "__main__": import sys + def dummy(prog=None, msg=""): pass + chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)