From 03a477f7f0b3016dd38d00f9e24d0cc5925d5a04 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Sun, 4 Aug 2024 14:07:55 -0400 Subject: [PATCH] Add pdfplumber.open(unicode_norm=...) Allows user to pre-normalize Unicode characters. h/t @petermr + @agusluques in #905 --- CHANGELOG.md | 1 + README.md | 2 ++ pdfplumber/page.py | 8 +++++++- pdfplumber/pdf.py | 6 +++++- tests/pdfs/issue-905.pdf | Bin 0 -> 13656 bytes tests/test_basics.py | 13 +++++++++++++ 6 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 tests/pdfs/issue-905.pdf diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ba79a3b..ea93cff3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ All notable changes to this project will be documented in this file. The format - Add `Table.columns`, analogous to `Table.rows` (h/t @Pk13055). ([#1050](https://github.com/jsvine/pdfplumber/issues/1050)) - Add `Page.extract_words(return_chars=True)`, mirroring `Page.search(..., return_chars=True)`; if this argument is passed, each word dictionary will include an additional key-value pair: `"chars": [char_object, ...]` (h/t @cmdlineluser). ([#1173](https://github.com/jsvine/pdfplumber/issues/1173)) +- Add `pdfplumber.open(unicode_norm="NFC"/"NFD"/"NFKC"/"NFKD")`, where the values are the [four options for Unicode normalization](https://unicode.org/reports/tr15/#Normalization_Forms_Table) (h/t @petermr + @agusluques). ([#905](https://github.com/jsvine/pdfplumber/issues/905)) ### Changed diff --git a/README.md b/README.md index 6a959436..edc21746 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,8 @@ To load a password-protected PDF, pass the `password` keyword argument, e.g., `p To set layout analysis parameters to `pdfminer.six`'s layout engine, pass the `laparams` keyword argument, e.g., `pdfplumber.open("file.pdf", laparams = { "line_overlap": 0.7 })`. 
+To [pre-normalize Unicode text](https://unicode.org/reports/tr15/), pass `unicode_norm=...`, where `...` is one of the [four Unicode normalization forms](https://unicode.org/reports/tr15/#Normalization_Forms_Table): `"NFC"`, `"NFD"`, `"NFKC"`, or `"NFKD"`. + Invalid metadata values are treated as a warning by default. If that is not intended, pass `strict_metadata=True` to the `open` method and `pdfplumber.open` will raise an exception if it is unable to parse the metadata. ### The `pdfplumber.PDF` class diff --git a/pdfplumber/page.py b/pdfplumber/page.py index e9ae725e..d8721282 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -12,6 +12,7 @@ Tuple, Union, ) +from unicodedata import normalize as normalize_unicode from pdfminer.converter import PDFPageAggregator from pdfminer.layout import ( @@ -382,7 +383,12 @@ def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]: attr[color_attr], attr[pattern_attr] = normalize_color(attr[color_attr]) if isinstance(obj, (LTChar, LTTextContainer)): - attr["text"] = obj.get_text() + text = obj.get_text() + attr["text"] = ( + normalize_unicode(self.pdf.unicode_norm, text) + if self.pdf.unicode_norm is not None + else text + ) if isinstance(obj, LTChar): # pdfminer.six (at least as of v20221105) does not diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index bb26b106..9b6ea71f 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -3,7 +3,7 @@ import pathlib from io import BufferedReader, BytesIO from types import TracebackType -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union from pdfminer.layout import LAParams from pdfminer.pdfdocument import PDFDocument @@ -34,6 +34,7 @@ def __init__( laparams: Optional[Dict[str, Any]] = None, password: Optional[str] = None, strict_metadata: bool = False, + unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None, ): self.stream = stream self.stream_is_external = 
stream_is_external @@ -41,6 +42,7 @@ def __init__( self.pages_to_parse = pages self.laparams = None if laparams is None else LAParams(**laparams) self.password = password + self.unicode_norm = unicode_norm self.doc = PDFDocument(PDFParser(stream), password=password or "") self.rsrcmgr = PDFResourceManager() @@ -70,6 +72,7 @@ def open( laparams: Optional[Dict[str, Any]] = None, password: Optional[str] = None, strict_metadata: bool = False, + unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None, repair: bool = False, gs_path: Optional[Union[str, pathlib.Path]] = None, repair_setting: T_repair_setting = "default", @@ -102,6 +105,7 @@ def open( laparams=laparams, password=password, strict_metadata=strict_metadata, + unicode_norm=unicode_norm, stream_is_external=stream_is_external, ) diff --git a/tests/pdfs/issue-905.pdf b/tests/pdfs/issue-905.pdf new file mode 100644 index 0000000000000000000000000000000000000000..02d5a469382cc9c28c116786878476a82f58d266 GIT binary patch literal 13656 zcmd73WmH^C*RG8dAV_ctp>fy72@u@fU7E&%yA#}+;BLVZ+!GvvH4ec81PH<1C0I_g z<=J~b`#s-#etta|G;4OPu30tLVBB|IRkW&-(yW|pKy=#Pjnj?24`=Vb^g_{r01kko zxh?w3mjHHY8#_0!3+&y_%nd9F202=Q0qlxk2P-#g04EPOKtu%H)y)NLW{>U#uk}_8 zy_*9^>PooJ^#jJDoFZw6P#+>b%F9|izONb^A?fRk7N%?)O1IB)|KQl?P8ET#3N$gAk-pLS$qzi=9EO6u#))t zYb?d6IH7(sIg*&$Y=m@V5rjl;qiOF@B)m9^asF+Dn%+*ZA*q^K{r1}h?BE9Af_+0X z1Z$uKwy-f1ck}`naKHk*oLm4t0WKqS*qr>)&*Oc6&ZMe~BS-`61~7niCMgYI*93dH z0gM3b5{`C`E*ef|ATZ!@!X;ce0X%78LVtFv3`>?>?6HUrunGd$CBYsxAh5cO_fNU-@F z+9{4Iv`bA=YGVoV8ttf(=WIFizF%);eyeeA-s;GIt_4o@E*PoXO&9(RcfKzE;a-;^ zF5YnO6#_91JeDr}=T%EfB$;bTgz39>;f#zDS!n&`A=?{^)!Nd=A#ux6E9JXZ{otG8me zU%}o^2zN=NpG98Zk)j3HV2|=@W3@u%2mp7A4A-!nefA>U&N#96$5VQYnEib_SNOcw zY|Ptyw|T*iL!IFQR~f^Hx7tGQi9~~_!uhrKS-@s= zu!ki(?=#U!*tVA1b~0LMlyz&^%OKr&HVgF7z?s!RVqSVwl7O7o_}1!bSFJ|HSh#+LR$9; z*@>ihyeHX6eL)P?EI>J;5qjDHl2-Q{|0?nl{y9{4=)#Y!9nw39v_O&{rX zFg9f>fX0qS6NmK4)|30tXWiUtK_x0~RN*+@y@uwDwQs+QwaGoFQ%=H{3#TDcr(w)! 
z2-1|Zr81%urfa868WgmUaUs@wWvo&~lLs)N#ZDAeO=h6OSCM}%B|fh#Am9AT`<1su zu{vgn!?bQ$dP|PJO1q@Lyg$245-{y7ZEdhJsXXCz%3hi;txED-nrE5`TOPd$WuW}S z`@1FtwXRoP8bO6M+WFF}4c)?Fa#EQEDdv&pndVXEAwPKf)M(ohgAyBYRZ3s&)J)PA zQ<92L6?;!MH7GR*E`S#}$YuD7Zl{uV`pzk~+t1O!v|xh`hK=VNq#N}^>L}eDrq@7q zl43{_q!ki(#=Y4*zBazdTWw?ftBo<`CzAt{UkX)9 zovoSeyzT7_TY2pGhjGu`@f;KGU+Z@fyEQFQK20~a7d^YeyGf*(q#dLrq*|nD+)y5p zOozh!icA3s-_l>UFIO zmFvgwj1t)l_{16w8#O$|RTS8D*-fg|j`Bs+@+$L+I%PTy0*5|-0W6rth-Gz^RQ8{vnaj|-LKBt)77f-t779H^J!D<`hTWQ* zo`v4b*G3@!{&3qhQ2p5ZrOl6(rK~F+od^CxnMO5BgssZ0MRJU}uoT$PPUJ>nB zS6|&c7+OR9XP)hCuQrXHK-N&{ed%kb=$i)8D%pQJRb0^60hiUHU3Kf8TV=jIDs3AQRJ{{2^(JbR5GtGOb!C zL8rPg@yh(j_UMC%ph%QRtOy=VGnD(YUz^<{+fEH+ za|3BEsWsv@aSO`46?_fd0t(Z&v-fZpa!9lc*~}bj;T61^NKj;O)w66(fBT?# zAq%0&q7Bi*sJE?Kwvqwo4XKSh+dN;-Tz42IA3Php4dAil@pQN9@OTKn>*6_lja3;o zTV=S|tn6vj)v|igu@`3#u$P@b9+n=C1(U5fL@!UPy;G~{Ftx5f+JuTXi>s}+vyK^H z+4b2G*c~ic)>qmtrcoJBT$m64i2k8M`z>*AMbBU7=hDb!=s|n&W$~@&caJBY-g!4|`#!U8s}()tdj-2KomPP+fszNZ zCkf*+VKT-2-Erc%=#fK zh4F+jg6u4TJkNkracI~{OcYUsppKhJOT+qPPE}3S@^DaEwed-_OX*_!?VVOyxmA^b zk&u6mfBQlBeeCRu*4$K+{Q#Stk4N_HJ5KfYscV&QJCOaiZj5f(moHoVH{SlrAw!Ho zxjAI(@cm`{O?bzD`F;Ym6x&#I#E1Xh`X02uRzQ!g!jxMkT6sTnJ!e_ke!l;1&UCy! z(kd>zNrL@X%Fk+*9^~=Tac1cT>0OyhnQ_QZ$dXU(dEfCcByDOftdsHK(^bkud0v3$ zZM&8EVDItkiMn9nJd@=@TgVQ0Y-l+X3j??{g-y- z{2#QdjEkA~f5a;g$j|e?i`QZuR6`A&MNz&h%bM<7u5Rg6c<^xK2wYvncPhG|&}Zg= zCy_Dg6{br+KC4X>lt`5m1PQ9eXzxNR*{9XTO8fK;b^Y_Q-gGZMEFawlm>eEC{n`>( z)BAPy4N-i-z)RhR4Vf~To-sL$-f8lEadG|j)iXpADy8*kZtpt;MtU|j()R$l4oTUL z{s-e_Nn*z4L-9i%kNv=XHRNstkr6@UN=env;}TD@AUHjt>?s6PMO>KcvuEhVG;D|! 
zX-UaAPrV36)F_NG%q^a=B|ib1i!d8Pna#q^PEWm9Jas&^vspwnvTw1qlVTo3JKGEe zZ1{fGzj<#QpG1u8s8iD^{uzU^mi@bgpRqNSspIE+sUOM2pT(Z2*%gIp;KIou5DvH| z3cSgI=d&E@PJ~B=>tIfliTA=4Z&E8|U?iwP`@Wrh$KX;)@Lh2*J6-qlJB(k!COFwx zU*VmyWCv(nEfKGySVV)e*CKJc@SjMC!-tTEHa5j)aYL9fqH*N}zRG=LAV5y?2xcqu z;brH-sv$QOGIQWBCA!R)R=`*r5YAU&O5H4A+&64%3R zCfv{0vW)I&deZ}UL@-DtD{!qq7Qe~lg~FNNdTNos3W!Mm5(7R(;==k)?VqNas0hX5 z7Y~e7=Xilz;Kz;naGHSRPEE-4%3QPPm0o1T>ATv`2v*js90~|ZC_r>otbuyCE5GOT zx9jLGu^5Tnx*|iN&CN9~l4IfsU&WdeztK{V&Qp6HYVD6u%xZ`jyapqkZ7%QH_tT%5 zFBt~xBA6e{*$Px2M?jvICt2q@8@0FZcUoL%@XFYcWzBq@Xj9{mJLX${!?w$_ORy`k zi`g4|#XhZU%l^)H`u6zSPwVjy&*QUkLktJeXcw=EtJ3A_;t>-DwIG{f+yjbe1m5To z?k^)9;U!hD+NfO*5S|j}#MC~e^du8j<`ORz#<S|$tU@0K(c&a{3yR;faZm>rSp!ZFz)Ut+xa?$LnT--p3c@+4v%C9(vEralA2 z8Pu<@%Ua)58V_K1)aN@nMoP&sV2&c}NyaOX^nQBEDwa}0S}DFaVc9@_AdXq`j9gr1 zg42!k{ta6Ja|zy32)in_Pc+IG1@2HYV)CiaZk(VIg!;a04$l!XjL+h=)-3P}JzO;` ziot`mT>HQZQiBkCsD~-$*G63o57O2*Mtz5-OS%-7sF$h18qX45luATbDN}DWKFWyHn(*maC5999q7L(nNb0?7PIat} zrE%EYcXuyp%^;6E!R4^4HOD8U`3f~phz6995)_Z4ntR2~1!^z(bDs>!4^H{$m&qGQ_FRHZBx1LZ||p;tmu zt@6H-WJOauP&fJ(QQ_3FLZV6LoviKK9r~TQZQ=8|r##`vJvq;yvR1Oh8xT^cF%%8T z0|i0jqf?_RqpRgBqF)tDf{TaPQnXWsnS_}n$`;Bl%O1+)%I3=yH2KPS%W^f^t8c`W zlt$%8Wmt`sDn9Aj7cPi>JhNT0b)0seCZ7qJu__s!(J4977A+2txep5u56_W~mu`{H zhoZ-#!~#jNd5nSX8C8mG8lMe&E5N**)tq@YW!4L}F5Ff&skXD$FJ{{+=BtYfz#j-I z`lj?tw901++!bD{8t5!4326(d_%>e}A7CuB3%ZdD<*1SeDTj?I=Kuxc+?5V8{ZQJ> zumcKMO^PZzX0Px+G?h6=K$m!m(*-l>(#QCgY!=J}tt}l09q4!#Ej*wVE8jl99C*3t zm-7?tOa#Ly1`eaPPcSBlP(}BK1&cZDV}g4skEx9Kivs(&>IjAQN2}Q8-XLds2tjH#jA?-!QXN2ArAi*ZX0( zI6IP;lj;p4Z9lD&E~Uc5>Airg-Yua_JZr_+gfohN!XqsHk4lyH?7cy~TURP}4d6~C~55$NsXJ@k@H__ocijjQdG z*V5_i+1oSYL;vN7U0|P8!Z>FhUq^07SVx3k;Fasi@X6xdoX~tgozrR6VBB5ho%e&# zT`PhvJPEuBvKz8K$|=GgLLgQh7B12Zs_-Uc%fj{2HCM?@B|%AmYeTEX@i_Rq2Bnv* zm#o%6v1-nj^)I9O*gX~9b={X?n4!L*^XZfpIPr+k8Pz2R0m#KwypJ@?s|o| zHo1NpA^DDa#&?{D9*neyzt=tr}{L4iO&b;{m$-w+?A`-+Y~pvt~&a5;fiXZR(FF?VQMMn zn*ZWXV}a5$K9KE*4V|Hk)kl~eyu50DU@qq@69pQqOYAQ;D>fOY-#puDOwmhaW!y9V ztXBh~{Nyy&{Yh#zhKcZk16 
zSDenOcNQ;p0y=O4oo(iO!C9&8)wr7Xx}#N9r2&oEop=50OC8^?HzEi!nAs}N4V&!= z>{BdeMrVS`uPP>tHCi?&7D``d&DPFpRo=SZ?_Utv(#_bdylWBb$bStwhi)qIW@+2| zH0m`p*L%nx1hKw1NVWI=s8w^Z@adg>?BeJr9ml}il9v^nfaNS{h|Ywg`ay> z$wmhzzJW`sj_M{xzvdhU537XHe8+gbc)0g@mRemXz?7K-`2wtNmr|))WXdvY-Y32fDl8~88nqeKF`zP7(7)G@ z^E|ra;=(zpMY2j6z47-o9-R|)=|7lXciVT~#v2Dw~d-Koe@s!!}r0U!z zLa&4Ki}*d!Z{L@jzLHFNUs#@wE;d?z+ihWOTJoj88ar#1@{BlS4Uh4kL99gO!B(P} zy?=eziRI?)L4Jg_b9gOqvgI3h9hdu{emvZo8*Rc9Q0Z-V(v&~fex+-oyOKgbm3RI! zMTC5}`cCP#k#+oeuK$(qmm_k-DHNaF;oQJ$&`*ptqSjy8=T?iR-@f;xd9z0tYqj$R zv~R^;WnDIwI#OVd`pN`0o>ZR%e(Z?z3H&9o9D)4{1r7V!{NE1l5l8&DgZs_0{msHX z(vv?O++T3T_5X%T9tns$3~j)yo`Q{qD~yjk(izzIJ+h@gdCk9pjf9z-nVqB6@8rhy z4`lP#9gl$IpGo}2PuMlw&E0+jKAJA>;NP=*^r63hDE9agXI26`*g3MQgRR`{%pTDf z47^B!T|q83PHv7a=rCCF8~l-SfWa^u2UrG5W*~J(dozc>BGNW4u5J?6u)BE#U{=-a zZx=c?7H-y$qvYk_LjT7Y|2=R2?D_Xhl+Em66zBhvshyb>3{Js-&*PoqFfxU1z{0Czy8-yP1kl+v+`#tQ00G#TbpHtRaPt9v3(ErdA3>9c)o%lQ9Npu@c>V3_F((*K z1akp6f8%F=<7B@D{w&w;bdeJP6 zQ$d~rg70hmRyH8BP;*u7-&kMM=3P7Zc zZcF%mlp3Le1ddIN5*e-xXid_lE>}7X%RH8(@4)26EY20t+Rt@&{EW?uob8rKk z+u`Jlj2Q0PZo=tKl87(oLW{;6vzPcBf2q3}^o%Ut{+RMD@Fg6vE5T!<1}2Pu35$)% zVoBkm?bwdlz1i1_X8=379Ilk$&cAbO#_Ivl3%{C|^s&vWS-w%(rATPGo{FooON9`>+@8m!6GSgtikhx^xPrx zVebtdCRxfgU7jSG5AqI68`RmX*5LGrYXW*rp*w`x=6*ffjD4a)PIbuiZ>uNiiRoKL z*80V7u9==Y^2GTIOHyF0>VCfSdD0=MU(mVRHGz8tPTvZ5#JG zdQ}w==L{e^l0F$*Y`!6fE|vq&7n`p&FqTPU9MoF+%|$^gSvx)yYol)a;I=L=;UAOL z*qg|w^;kuJdK{$=a)8LUvLQdeT8~aMtk{Y1mvD%*KGM5lbw7_IoadL#atsnHJ`sG zShK;(hO(2Wa0VpAvO?(doVMu|PC^mm96fgWBh2eD;_OfN`t@rDdqN++M?hv5Ywc`ejG?*ir=b!^tIf${YwK^!88y4T)FhNtuOGu)b%g)XT3e|)u4Rp3vPlsOm3xEo& z@p?*dE6Wg5yi0K2nRC-6x1NXEDMpe7y~@-xPAwibVu&N7(bxN&KU}9usAE)s3|(~p z9Q4ZYV>It_?)&~lPLV2vpqiiLP<}ZxLu@vXp&&kpBuzGi9St-x@Zh6}DGg-1Y`y4% z#MnBmMn2qhlR|KLqt%$giy;YI!gZq3=Q2^UwBE$Ivntp}Z#wy9ZkLYxV}VdBtcJ-r*l@VT|QWkRnz zV#5$ITyGo@0QBbE1FrDZLrSpUxz#%Glz1yL!nN!R`&{*H3iaHxr3!%8Ga{#D*tBEOK+dOb(U%2ZEZwL|P|8sjigF3#M)7rG z8%A&(6bZ+DN7{oA^IJDs;&{~x2cpnZ6&{YJwDsjorhL{ojOb5!7Z`um_yttzYx0p3 zsTe?!;dj 
zSz5&G7u;DolRiG0ce(62l(xN6xa|p<7rX0H2J5GLGm-K#l@29pQ2qt$5jAGo#kI*% zY0WRzQJkuQV)nM>>q?BQ4Z5CqQT&kYpZHn48yD}*!o=t8DX#IkMm|@l(%+iAQZ;hEnKJZ6?HfHI1A?`CMAR_|gWmVaV=T=-P5QEK)i;Zkr0ip20HsFqwK0ZUF8UwD}bv7=` zIaHrIAvkh+;zNJw9?PqV`B(3-BQ?4s+zQk?P(S`q)Xi@Mz~lR1v?p4Xh?}?K1}YU> z>|*`|^C@3!8uMu7iq14rHzp}5^GT-VibaMG2xanPPP{TlMgkyZ4@w;74NQ4IrDG1|G2m4Pg(Z6Sbh|7Fv0h~s%VM-O-28)rApIL%faTi zU1gkq?0Wi3J3k89|7hpOi@$5!#| z;M?G+yG_Wkxe)aYqMGU&$eskxHXW~^M!F`b+jWh+B-YMtO?&Hy@Dti~nQnT%PMlU# zCq?XZO0$5+?>H4umYJi3&((APi6&c!6p?`zZrxWpwU_L9t`YHZzBQl6KrS7>U+!($ z`D=&?KQ*m&=c<3|S z`bKQ}fabtitfyWL_n%BP`5}3*0<$(E$KOY&O z0L*dxznBdC83Iwy09U^1QA@sgQ*p0-QlbP4G$eu|bzVw%PI+qUd98#wlwjOo96xT% zBGm>K)1vNZ*%^Jw;@PUyUHLcfweVv!pP8%qnU&)g&r)HNtrl5X7Uo^t-k>&I9cH@* zY~}hI3!RPTyho1N9gwR;LLaqD7s=aSeaTR9HDZwI$Cw>JmY^o``e8pDf6PzZ>DRog zV_r)47_i%^@4gXz*fjJ>y!o)0<;g>URUhiPLg6=@#yrN<4!kD~DwTRIcK4pg$%M*v zC5^e|>|9yli~>8pOES z48{U(lAK$p-qn>#xhbPR87*+h&-m2&bdf2?ut2UJ9ARH*Ki_as!xLVqmj5J##LB=a z{Dii!{P{22ej-+Wf7=w1QT`>fTtSkF58A7m<=&^gaQ3aDD-8ih;{vYv618 zH6T=U=%kb=Fm5fq$P41kGT+WVZ6CmeML`j}8@I+!+P6Y1Bf>N(Lrg)iDgiZd>^32$ zXjnVG4ZCq7T4wL{b#5E>ia5E<_q*p^Qx#c`UKnINy`oO2Ky%@C_Xum= zPzvHsE4Ro_EcSq{-ZAn6A9_&#C8LjwkVix2;v%BYExsLg#sRz?=p~U%U8T6_u$S~# zpwV^Z^}^7#626MeNis&1;a%nlO~lv_9SA*f=SUwRbSc}c)ozts94A??Z7<*bH2&(-qfE0bZ`2N z`Q#{;x}(adpJZ=+S%2=Wd*$XrE27T_}4rOXNKpw^;=);8Uo zFs%Yzf|IBqvy_l6%8%0% zg{ZNGZ$R?vUN-QBpG#-Rf{}@{Cls(9WNNro zP}`r-VKUp5E*2`BsoEBk@hPAnyP<*Nb+ORDBY=uJ-f+e{i-2BXA+$5Oza!n+c$e<% zp!`-OZS@NIV<&P4;AZY!p5$Tw_V6}!`#}#wVqkFzPCH2$1OY1WV>P>WJB4DpohjkS zeeVv?=CB@RluOiHCz`gJFeIGfq=3|jdZ>BiVyK*x6q2oZWH_I(uo22U(`vV4}jB6g_1qn&hu@=+(Mo<0|GIY^`wqn#hJqjzeG@f-eMZrO^im1RstBnOwpX!DAyYN z!)4556bWB`gt-id9O^{354DoZGbW=n!tA`^_Gy~F)94?S)<<1FeU%vAZW`)Uux*kR z0y};^kNFhKyU@Dm6iR3&bnqc9%+vKUKAZK5c@r8DGSW!_4*rX41GW7`)HN>vrelDgtk^gG4`F z<4zCOvLxlG53EoIQ0mtMTt_nN5Pj*KhA%YUDM0es9O)>pB_;x#(RZ;e8r6+Alx-S; zbj*_enE%^m$wIo~vs{r=MzwG#nElP8x7mShLfQJ+ZhE|7%9Y;$Q_VtMTq)FQGj z3lwfTpr-B_0)&Odxt3?>`YPBlD`&+($=D{@0zUP6=bn(j 
zmwD5|T6BvPnbPrdsn2(7M4v_D`?JnUQE9t<*pT@Y>CHm*!eRde@J%}7w4C|hre>1d!?-vgC`+CKZGLT&<6 z)Ise-rtRP~1=^5J_U@>^ZJqm|uyxLH9UYCAp|X6o}8sBbZ8uYy3Zx52Y(VffB|6UWJnFb2Uh%Kj-sljkPin?AeP zmJHqnIe*KR&*ZXBcYj!+QzlQ=N~DPpk~_~d1k1_e=|xoxtqVwJfRovzCBm6HQTx1q z^BJ{!u^IEWNJrYP*WBz{F%zR#^67_%)jx=_T03&1TZj~-F4Lk?M)*)94?jQ}q|P>< zlJK_a$0w@1)wFs(8=_?v%So# z<;DnG$A%qId0f0_csz9>WnuH580=dlAJ0WFND6XsaB*|+!xqS4OWhp2`WzheupijH zN{$x)-%b9eF}+;CmgpP+PJVQb|GWUaJUl=ifFzj8H3>pKG<{ck9M3~0z7{ko12Rn zj48PMc34Tn#s>_Wd)UbxM@JY?`kf=O%Q;v&0yzITF(W4lLoYAQEjeH%0dhaS0XezN vKzuyhmOR`7ykKq~J`f*J1pR++^1A_bb%Q-@zg3+R$O+^_r=^urkw*VN7uc6* literal 0 HcmV?d00001 diff --git a/tests/test_basics.py b/tests/test_basics.py index de983fcb..98932280 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -192,6 +192,19 @@ def test_password(self): with pdfplumber.open(path, password="test") as pdf: assert len(pdf.chars) > 0 + def test_unicode_normalization(self): + path = os.path.join(HERE, "pdfs/issue-905.pdf") + + with pdfplumber.open(path) as pdf: + page = pdf.pages[0] + print(page.extract_text()) + assert ord(page.chars[0]["text"]) == 894 + + with pdfplumber.open(path, unicode_norm="NFC") as pdf: + page = pdf.pages[0] + assert ord(page.chars[0]["text"]) == 59 + assert page.extract_text() == ";;" + def test_colors(self): rect = self.pdf.pages[0].rects[0] assert rect["non_stroking_color"] == (0.8, 1, 1)