We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
```
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[8], line 10
      8 jso_useful_key = {"_pdf_type": "", "model_list": []}
      9 pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
---> 10 pipe.pipe_classify()
     11 pipe.pipe_analyze()
     12 pipe.pipe_parse()

File ~/.local/lib/python3.10/site-packages/magic_pdf/pipe/UNIPipe.py:26, in UNIPipe.pipe_classify(self)
     25 def pipe_classify(self):
---> 26     self.pdf_type = AbsPipe.classify(self.pdf_bytes)

File ~/.local/lib/python3.10/site-packages/magic_pdf/pipe/AbsPipe.py:66, in AbsPipe.classify(pdf_bytes)
     61 @staticmethod
     62 def classify(pdf_bytes: bytes) -> str:
     63     """
     64     根据pdf的元数据,判断是文本pdf,还是ocr pdf
     65     """
---> 66     pdf_meta = pdf_meta_scan(pdf_bytes)
     67     if pdf_meta.get("_need_drop", False):  # 如果返回了需要丢弃的标志,则抛出异常
     68         raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")

File ~/.local/lib/python3.10/site-packages/magic_pdf/filter/pdf_meta_scan.py:339, in pdf_meta_scan(pdf_bytes)
    337 text_language = get_language(doc)
    338 # logger.info(f"text_language: {text_language}")
--> 339 invalid_chars = check_invalid_chars(pdf_bytes)
    340 # logger.info(f"invalid_chars: {invalid_chars}")
    341
    342 # 最后输出一条json
    343 res = {
    344     "is_needs_password": is_needs_password,
    345     "is_encrypted": is_encrypted,
   (...)
    357     "metadata": doc.metadata
    358 }

File ~/.local/lib/python3.10/site-packages/magic_pdf/filter/pdf_meta_scan.py:305, in check_invalid_chars(pdf_bytes)
    301 def check_invalid_chars(pdf_bytes):
    302     """
    303     乱码检测
    304     """
--> 305     return detect_invalid_chars(pdf_bytes)

File ~/.local/lib/python3.10/site-packages/magic_pdf/libs/pdf_check.py:44, in detect_invalid_chars(src_pdf_bytes)
     42 sample_pdf_bytes = sample_docs.tobytes()
     43 sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
---> 44 text = extract_text(sample_pdf_file_like_object)
     45 text = text.replace("\n", "")
     46 # logger.info(text)

File ~/.local/lib/python3.10/site-packages/pdfminer/high_level.py:175, in extract_text(pdf_file, password, page_numbers, maxpages, caching, codec, laparams)
    166 interpreter = PDFPageInterpreter(rsrcmgr, device)
    168 for page in PDFPage.get_pages(
    169     fp,
    170     page_numbers,
   (...)
    173     caching=caching,
    174 ):
--> 175     interpreter.process_page(page)
    177 return output_string.getvalue()

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:997, in PDFPageInterpreter.process_page(self, page)
    995     ctm = (1, 0, 0, 1, -x0, -y0)
    996 self.device.begin_page(page, ctm)
--> 997 self.render_contents(page.resources, page.contents, ctm=ctm)
    998 self.device.end_page(page)
    999 return

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:1016, in PDFPageInterpreter.render_contents(self, resources, streams, ctm)
   1014 self.init_resources(resources)
   1015 self.init_state(ctm)
-> 1016 self.execute(list_value(streams))
   1017 return

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:1042, in PDFPageInterpreter.execute(self, streams)
   1040 log.debug("exec: %s %r", name, args)
   1041 if len(args) == nargs:
-> 1042     func(*args)
   1043 else:
   1044     log.debug("exec: %s", name)

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:972, in PDFPageInterpreter.do_Do(self, xobjid_arg)
    970 resources = self.resources.copy()
    971 self.device.begin_figure(xobjid, bbox, matrix)
--> 972 interpreter.render_contents(
    973     resources, [xobj], ctm=mult_matrix(matrix, self.ctm)
    974 )
    975 self.device.end_figure(xobjid)
    976 elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:1014, in PDFPageInterpreter.render_contents(self, resources, streams, ctm)
   1007 """Render the content streams.
   1008
   1009 This method may be called recursively.
   1010 """
   1011 log.debug(
   1012     "render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
   1013 )
-> 1014 self.init_resources(resources)
   1015 self.init_state(ctm)
   1016 self.execute(list_value(streams))

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:384, in PDFPageInterpreter.init_resources(self, resources)
    382     objid = spec.objid
    383 spec = dict_value(spec)
--> 384 self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
    385 elif k == "ColorSpace":
    386     for (csid, spec) in dict_value(v).items():

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:234, in PDFResourceManager.get_font(self, objid, spec)
    232     if k in spec:
    233         subspec[k] = resolve1(spec[k])
--> 234 font = self.get_font(None, subspec)
    235 else:
    236     if settings.STRICT:

File ~/.local/lib/python3.10/site-packages/pdfminer/pdfinterp.py:225, in PDFResourceManager.get_font(self, objid, spec)
    222     font = PDFType3Font(self, spec)
    223 elif subtype in ("CIDFontType0", "CIDFontType2"):
    224     # CID Font
--> 225     font = PDFCIDFont(self, spec)
    226 elif subtype == "Type0":
    227     # Type0 Font
    228     dfonts = list_value(spec["DescendantFonts"])

File ~/.local/lib/python3.10/site-packages/pdfminer/pdffont.py:1097, in PDFCIDFont.__init__(self, rsrcmgr, spec, strict)
   1095 if ttf:
   1096     try:
-> 1097         self.unicode_map = ttf.create_unicode_map()
   1098     except TrueTypeFont.CMapNotFound:
   1099         pass

File ~/.local/lib/python3.10/site-packages/pdfminer/pdffont.py:830, in TrueTypeFont.create_unicode_map(self)
    828         char2gid[c] = (c + idd) & 0xFFFF
    829 else:
--> 830     assert False, str(("Unhandled", fmttype))
    831 if not char2gid:
    832     raise TrueTypeFont.CMapNotFound

AssertionError: ('Unhandled', 12)
```
基于捕食者-猎物模型的突发事件下供应链稳定性研究.pdf
Linux
3.10
0.8.x
cuda
The text was updated successfully, but these errors were encountered:
已通过魔改detect_invalid_chars函数的方式修复~
Sorry, something went wrong.
No branches or pull requests
Description of the bug | 错误描述
How to reproduce the bug | 如何复现
基于捕食者-猎物模型的突发事件下供应链稳定性研究.pdf
Operating system | 操作系统
Linux
Python version | Python 版本
3.10
Software version | 软件版本 (magic-pdf --version)
0.8.x
Device mode | 设备模式
cuda
The text was updated successfully, but these errors were encountered: