From 748da3f2beb31e3461a98a4bcb9cdc83a7b1bac7 Mon Sep 17 00:00:00 2001 From: Mantas Date: Thu, 27 Oct 2016 10:19:58 +0300 Subject: [PATCH] Add ODS support ODS support is based on ezodf [1] library. It looks, that ezodf has a bug [2], where number of sheet rows and cells are calculated incorrectly. Probably this is the reason, why I get extra rows and cols in tests. [1] https://github.com/T0ha/ezodf [2] https://github.com/T0ha/ezodf/issues/12 See: https://github.com/frictionlessdata/tabulator-py/issues/28 --- README.md | 2 +- data/table.ods | Bin 0 -> 10142 bytes setup.py | 2 ++ tabulator/config.py | 1 + tabulator/parsers/ods.py | 67 ++++++++++++++++++++++++++++++++++++++ tests/parsers/test_ods.py | 51 +++++++++++++++++++++++++++++ 6 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 data/table.ods create mode 100644 tabulator/parsers/ods.py create mode 100644 tests/parsers/test_ods.py diff --git a/README.md b/README.md index 5203d464..903484ff 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Consistent interface for stream reading and writing tabular data (csv/xls/json/e ## Features -- supports various formats: csv/tsv/xls/xlsx/json/ndjson/native/etc +- supports various formats: csv/tsv/xls/xlsx/json/ndjson/ods/native/etc - reads data from variables, filesystem or Internet - streams data instead of using a lot of memory - processes data via simple user processors diff --git a/data/table.ods b/data/table.ods new file mode 100644 index 0000000000000000000000000000000000000000..c4fe3c91f4aec91b1a66035bb3f1fc26730c8414 GIT binary patch literal 10142 zcmdUVbzD?i*e*&+2}%lz2q+=lDhMdu4U$90FbptscXxMpccXN7cMjbtec_zXPfvJ$ z_x^d``R$zjK5Or_*M6S0-gij|!@^;~z#zfExJZ5xbO*Bg(7?dJ+%I>hV2t#Qv?11D zZ2%Yy)Yk++^o>EZ79cGeV}O~y8I3Vm8>D5dX>Ozqg3y?OO|=19W_sG%5Q)EHjx^|% z!Nb7ZFL-yDGJ57l8X$l^(2N%HW0wXD()E@Q<3~e&hJ1GknxMdEsk?3E-ExZrd$)G0 zt;~agd1x&mBEw5fMovRT%}7PZ!Nf|>!o$tZBrYWMnNNyaKuSnRT3-CKgqXOhAdjMy zh?JC!lB|rXf|RJ7wup+Ml!BJ5Do{m5UD?14AjJ=m6$B~sYRgFg6y@~Qy1c53FH>L3>_h>xz7zn+!1xt)utou9pfho!5ZiA#*FN2r~Dm`y-}wSS6Z zaEdEf+1XUf)lARJ0^ny0bhm}LI@)^KnFQJEyExegI9n$9TRXeBID7c}d%C&!MY{UM zdW6RNgvCeuJ12&?g@%TNMMgzLCq&0Y#K*=ug=P7LriRC+MI|H$CzeK~b9$$LBXB2RfyMxaNgAr^JLMB}V7O2Nxy>*Jgz`6eMJ3X69$6=VoQ(<>e&h zR^=6z~5PAzXLsB6it?XGHQ zDfl*7+tOIo)?L#*Th_VIFbM6cD{N_~8miA|ZK~~UEE{MkA8M^0@2r~ZE}0*!Zf}~ED?e8CGo}BL)o*eD|Hr3ZSFwj3dGBPqVH#s`eIK0w0w$(Vf+cUn_GP6H4 zw=p%p+_QK%v$8e1yg$8hJhAzGVYFj$vVU=5ZgXL92Rga6K6eJ~{=V9CxITKkHGi=; zzqGsp-Q3w&UEJDS+uYh-**RI?JKNgdT0XhjIyl)mxj5ckK0AP(9PV5lK`)NB4v&wH z&o9599A2Lv-rU^Wx5L}paVebgyGD2__?cVAYHDZvTv$dD>zhe8PZ>gIxJ&V9^s~3T z(|M#M`YpwSc-Vs?SR~OcEN{q$y-$aLmdbjALJY$B0567T1 zz>5b9A2qnAn1q;ldUYM=H`-&t_?WTlHus#+cP$MPF$c%9< zf9+{dd2o$hEHz!fI9gw_ifPNOK&P=jkWXiM^}Ka=f*)U_zT&-M6$c~L>=z6YKj{h& zS^3Sm?1D1|EfcSb_xf2JW}?cY=KD^;H+B&c*4E0Lu}& zJzr&VD_U6!73U1$Ncb9bELT2VT_?*pJpUb1k!!*=v$M`3W`Ph><91?TFOmNqe3+rL zIwh<`5K-MMuFaO}w@g2FH0fF~dkX7w`FWb7l!?6y%i;=+&EH!-b)36q z)JPjE7baa%b|egEdm+uJ_Yt%uzYDU>k3N4}Js?kQP*+`Tl74geIQu*l*Yj)KKrg3N zTZl}4_q8oV#BxoHD{~DBIA7jmOz$L*ON4SlcdJ!*V6}9lzhsfMF0xyYvKKn|+5rq@ zcM=$JJjM039ZbFAyxDy-n`-lIA#?RLBIUNH9&8hZ8*?+k;GKp^=|MvM8l&J7*4UA= zvm?D;zUa9v2AMHz8$y1x>PxNqeCo_xnOY(~+ReUjlC`1IwGuzJ#hx%+C}sI>zi?MS z4Z_@8&u{WeIhNH__PDESOEY0oXJyknH+}Yqte}}HljSylg8zSZE-epG3|T{1&8(#F z*%(HUSL|~EkGjjBOpk_zArO6#t{IJ$5pYLw)^v&wt6@)?uBQD8 zueP`?sF9QgG~D7^G`$K6MlV_(ebKv%%)L7`-r>7KAHupR^bnUyirS4OwC@->sBHcm za=LBVphBZoU_Ykd9E4>(zZqASj{wHy=)$$%0@=*k0qO~_z}`pf=8dNVX!Z*^6x2A#EMY$jd14Ft7K#9aO7<{DxZ|l^<%i! zl+o&{Dm^kw%fOnBHHajO%*C+-`DZ&+tj7YKq>VK@7KPzA70;AHY_m7MxT?IcDEwG8 zgCOuRM#h7r_~%ha())rcUl8LSJOTI{JEml|p_j$$s+U z*~-fWRyui|Aji~%FRR>VHua^cgY8_n-O8)+rKLkFv7C=$!fcBWXbF&0uGodz);FKT zGPn`f95B31e^LXJ$E}PW?;6#Xgr6Xt^7N2!F!K|qhc><&Tei7W$r~Cc^bno}mUmSRCW>#yAW%quFKkOPTgAiB$RGy^6~e z&TsPZi@mb5U#}PW_HsO~?C_;y^tH5(i4&EP-U|?XOF&ZA%BowZOAOW1RSqPP@7>LY zHK;dAG8sv^)Gcd4nj@GDSonG#2Mn8#e(~DbQXlk{&bntET~Pv}9LMKysq`4)d*Xg@ zCJ`_~x&9B13C?L`Y{U`!+7gLF(Y2;Q)Z8H&!zG9}7;L2N zH#3L#dDIF!$2pJ}3{CB{TcE_E`__>t6OAU_e`C-!`PQIDcHLsb?q;m3Jm8TVl`zQzWv zsB!Hlr@760%*YaVPTG$J7mwed))^XUqr{G%6`tJ6lCc;)-<0asW$r8rWLyKGeK|B< zn(;GM>_FZf*1Wc~Ymta4l73x@x7w>dWW-fB`th*;*%z{!SJ21$@)VqJIIhuW=U6Az z@yo-SB^JHtSeb^~DqEAs9iliU;xHer)l+qOPlRO$12Bqu=px$jW^ zK^naQ(<~SbXB4Qo>;6=?+ZLs}y9m54FQ&94wT4qFog&DiACV!GYTMcxs*&^$*kUB@ z1=W4Jco9!b79B#y-}#!GV5P*LeZI$z5Z#k;^L*BblFu-MDRQ#a%7zI)PywgQOSiN2 zdu02^p!dtQk!tbXY^%gdpBdgtMc_Z_>iOo#pA^609;)hx5>8Dzs!Yoe!%$DR$g}!3 zw+x5V24G)}Z3JpjH%B=wN1KjG?(#BPAA*AWakdTleLxPqY~od;pM(OMK@GdB?m}K;?Xq@M5AZ$4<|Kc z^b90lVQfiv`9L~H76v3fory8ai@?HF_S*1x*qko#AY)D zdSrHeI5d|wkmFdhePe~hbGf$Ys-l_=$)asx;lZX%cs!T!>GDFVD~6J?svES?_d0a> z+x6){tn0r2k_W}(7cbmAW!%hLw5PC{zKOzeVN|q*71ClN@SKtNJbY8o_->Q=db$~L ze9ls;)(f6JG4Y43rr9FTqpYkB+=pXEXXIXEU#G*SV>3xgjy!^Y zcLdXgq^wc03Oq7Tt$5MrS^CZ8%Em?l@a_?Yw3;B>uF8SD;5BD!NyjUB1ZBDj7G0lI zFXY?`J%xQng^n{i#RmNZcfRB`5t+)bb5fc36oNU#_%AoK8}`d$_dJ9un|lB$-Q*x8 zpvr~BR`tfa>Gr^fVVHI-ivV&C>zyW`o#F{X0nU1os#h}qOL{bXZw2yWXm#$v$zjJe z-yH7p9(HDsp@ah!S66EQ%$lhtBl^WPm%GByzM^NT zu^!Ed<#UBMRUH0^gbBQ{pT`_%-ElxgwnV{ZPn$}MGcdU8G>wx_@k00l3np}q1u~en z(#M+Ykqyo!r7sLk*fd_~Af*!+DcBH*$qxxclAJkQ=|k*|kG;s;lX4=}6Ppk<36522 zP2+J8(0lfS6yBfMmpKyHqCm?m<2qmvy&5Z^QCGF#U7~*P2HgQ&e>Br z;MXy;ABMa{MIc+WScbNfp+cQ80K>>@8bi0l&=W4#K78If;}iNk5(}{izpzDGVaUHV=3(LD4eFUQ2emd zP=v&fJn-TSDcZ^45&!cF2K0Qq)Dr>JSCmYpish0_m7T7E+OHqo6qLOxP*^8z%9@IR z45K6~y0~s=sW~wXe345mRHcYI@CqhfeSa%f(dN$;DZN3R?#v(&FF|epMXz}|mf>V; z3ehC>QIKkR!5PgjIbB13v`IncEL$!@GXJ*^U?eRhWM2>rCjA4EdF@P1RGwGt9K>*@T(FT zZxazFyJj+g5|p+txR1rMR+)0yzVp};2EL9XX*bD0GOP2RP_Mtbk-mT%h@_A6|c9Is|NF~GUM(mD~ zqL{I$k#D)N=l0%{UYYpSddQ>MrUiIukPI=v`~_}weg19Tsg@?)UbM+ACo|1 zPuN+gI4El*leC$~7W&lP8r@8?g(*T+o}@-0j6~2={{3Vap^D{3QcP7&?<+h(F)=EU zm4K<|pl0+~2Z_YYdGrJ8nbSu{0-m1;gTKstle<3f@No&p79p^rc#<^=l`lfa$Kw%n z=uACIx>~ayV7XbGU6h;aT~y-v5Z|M7x>^3v%Gb*xj=3W3`TIckvI;kwY_TEdQ4T*vF9t36g)wVTX@ySsp7kn$smP%NtPrY4+KBhSyqi4Q*UP zDKeW3^Bj?T6K+=<%3CXN1(IVxOkoP4Zvsk`(nf>byH6cHi?`O!G&E2?sw*qWZ?sZ& z#T8UwMkC@=8hO{8uP4*&pOw6*YHAC|n7_~7t2r1HSI+RYay|2mUrb=!T)$EvKNo2l z$U-XirYsD1cjDdiY^f&$1FTGL>3$+WAIrN!%ePdz=lD+j(y%vh0yM%kJS zIHeo;Kil7UZb6fk4`5(Q@A@#m@4ws! zugoCUK`&n{jH?N0Q2+a42y6 zjGSbgD`ci!9G)DjA(nVwc@jv`f}Fl1`h>7O^S5lHnSS`ppKp2V@oZS5?;0zBuBwgc zKoG)S7MQmP!;fxq-BC+;18oj$JKsOww#3!qJbV7Z+&k2RO@prq1RIB{4h|D3?dt{z zrBZNvCyJ0@EopVdQFxn>B8vJBNz==Yy$UmbPUGgAir)ZGK{T5jQhkumw?nsqw zzl-X25%+CgOWRtAC8b+!0Sn-ftjgzAJ-S+7hJqUvJ}~z}O|8oai7+>#Pqi56c?%gP zBu!0(V2PpcUZc|`zNOzGnGP!<*2&7XSFX^L3iFVUvamqY$g0StZ++zT&6J&{mVz>X zVsUF~%dfytd|5}=Sx-0a(ISIP8^|)tZ+2+{qu5=?vA=hGfa1pL+Q6OanMJ&BcZ4!j z;^S^zk4Oz$0!E4m6t+YicQxSc)v!)5a;l-B~*&&20SVE!c$(D&+mBscn0vsIaN8S zOcZNx-(+A=9?EaQOfyvbAwK}``2e#s16 zKgX9Eka#6Wgvc;867p;mL&ME`kspTPi8xBCaBw22=2#gv(Oh(0|3W76z7D7OBHeQ| z-NTi*r)e<51@LmL{%VH3A z4vB{(PZr#eku|)x3QW=3CqZ_hTCoF0Z7X4gu6QC$=4o0os-A-dyr^+5%3sT@yc1Uo z^Ri!s@m3|-e;Me+oM8{+M(WCH#csvVgQYK;+p*JJ7Wp0bc)V;@6F!>e_3Ud@&)SJ& z54sS*N^9cH2di6+eRNxWr>rrqS*N)?W^+JfrS5sIa=7mrIoe^KoIxKNj-Ngo8~=P)jLC~4d%-e}|Ecs^Egz`y{pCNPwz59h?WNkUY)r;~{*IP%qF z8`#&+2p9PI8XM$^JY=yE2!+XWcz8PmIHXpFc}`c5wcCAJ&C5T~BkzmWYLJ!f=jOZ7=UlKUv)n}!YU~vVIRu_@ahFQB3}zgR3k1c&mSp1?8AM^ye{)DE=(%R3KW}B zvQt_aq1?*+NL|{h`kS7QIUh{V6$U&XA%9GKe!D}JOc1XL;ffL8KX8w^8E<54`JgVl#R{LUd{_V$O=C>8{r8U4=y|!LE2L0$LayU8@fYSyssfkOymD zQi#^z+j`}4O!n)nKkQM}tV(D#x`b%kEPW3Ox&$*2>^v~4gfE+=PqL;BdqBGMxCD7N zT4$6b=OI#V_oBincloxuB{SPWy&f-0e!Gb8o3~yRJM_cx>@|`pqpRGxL_&70lrYn+ zj7nDOm$1v+b;F3cYj{&=?(O2_Bi1@kA8#`@TlY9|%;6Wkv4!2{Wa#94U|S+TWrwkM z?A8<-FKKn*#uKlL9?n9x!gyZw<}>zrs`kew^X0G!)7h zHGOZrV2AQ}-G5@_LERguHt$mdCgiHyO0~U}qOsapO1?3i+zHp^@^bblpfTXE{zK5f zhjgA~Oiv|F8z-ngy0*0vhw(=HM0`6Q zJ3Dr3rhyvNmY&Qv-maQdNmAdIbEsFdb|mbeP9AZ-jAfU|m~j`k?w@)5(DhbabA?Vs zo+13y_1%_D%gwn`tvnHdI45ZTV0{4`XEfG^f?b!Xv|7<#oq>g8_TZe3u$Z~kjJ4}a zyTTg2rnv+TL{i66x*|e=-OeXad;|yLHoxM{NAOi)oxbUf_*8cu~Lv*36m!kXSl zRK(WgP)>Hp5P_$d#qvP$$vPWEy!?|RCOW4Bkn{r2-Y)BH%^JUN`NR5t8LPV9$~@1{ z%PxBtDJGZ`^#fH-do)a-kAvSi*7O&a-r#(-#Qhjt<0JWEQqc7<+AHI@0 zj{{ncD_fqILB+tL+MHVLwT)8#yBDz1qoL^P#yf0NLyHgcqnUmj*!-16U_UlVy(V_vGx!|Q(5)uoBE`q_zQ}{w&Br6v zWgOIhO^_Y8gh#^LcF2;-PBuF5X`*mPapp;%NL1DWjRP1H1}|D=Rx+$;;@Syp|x zFw_Fbml)jMo2Xz;(%a?x;4qMIh8WcXb&{C+un03rDWqg5MzuYe1BhXuuAcaMpoh2Q z2+jh*Zi7Gj)$5y60)v`{59k$R45V%y5#;3|;%BwlF+@AC?a5hFvQ#)g9cDjpT| z+QGpCQjc=e9HI}UErA$b!Ljv-eeX*8p)XhLRgJJ@ zw{?~!%OzmeGRJ&8nfIheXd2%1oPf~)VI;>*Bqat-$R2+A`*sOxH;~-I=a^>C2rq8K z2$Osqd}3)OyDbs&OjzdYTtto+OMc#3urG zw`+w)-9p~=P2w}sM=I83w1TxcHm&?@%iD{?^X~awhx$v}sud#3ujoQ@=1O|}ItHqG zAIa%s`w>dd$!1D#VedQ2SLZgy2Y3D87N!5{D8ISuD1R~r>FDd4oBokjHe;X#XliN$ zweJp%O=&gFO-=tN2nc)E-$uM2MZrA0U)|YX^UHsp`*Z#5Zu56oh_Nvc3;<~ZX@Bo` zpO^l-pU$1IK(tN&AHD8!;pQ5E|MI#UE&ih{eItObwizvtKEwzBHv5kv|Lp<6#$a>s zeYyYPNBA#=f=!KeO|{L;{`+?St!nP8^uN9S?LN&Q0Eqc7rTs2#WCQ@|>u8%nXiT+q zdP93bu5=h~dmr4j`zE+(;o(imyQ&QmzcG6#z|$VaJeg|<2$hpe%h^mESuWsXoDV3A z-2o7G_?Bcf$}nkCkaY>@m2XhJtEH(Cp>+9{rOQyJzfuOOJHqB_Uf$LDpw4Iecf|Wha#& z11hBIJu6-YJ9GIz66tb-xDwWg3AZvv97(b_Chsh@tJe>^rhHJ}rwZtwnDau-fT}Lk z7wsy$&fA6S+3afCm=Cue%#(zv$1&`RZ`QuIzFoVa>A%HaDlD`34CLxd)x8K>D?22T zH=lZfZd>KHC4S$GgZwzKpOzGabh48W!KQMs{zG z7n>rSjmt4i&Ib#)Apt!;;%fVyATA29X=VVZ+BY0p| z949N^&}W_Hd1F^4XW4rB4u6h?XBM{F1GcPh#7&knO(9;0#>4EbkSFmR`_ApXGft5NKu}%L4>Q;h|z$40}=6u6`0m`x1}bq zMb_`g4;q_~bc+HesvB08od=Zh4I7&m+VX&%=X>W@fm@Bf=`N{oig9I?NKYe8eL80> z9%DR~rOgD;j1(5CpQGK^v2YvCB=KDx^TOS8%!7wmFu$J}b63&7*pGQK|M2?T+?Ss& zGIzVbn+fv+<;Mr|YwqMeulmzC@4`>NW?BE?`AdlEKDYAIAa@k~OQz-Tj=zRG{@~vK z_2kFAn-=sx@vOgl{mT9Kp^%@(^z_%j$lv|$ulqsn_sa0oR9^g(EPtaDKR|!XW%>yM z19MMDKdtOvKz~(ych{0quI zRORMmlwVcp zPn;j0-LI7U*U!%U4(ET_^-iUJG`%05zY@tkCI7TC;@=RogfRTww+sV=dG`a~@edx! H{n7scxwg(; literal 0 HcmV?d00001 diff --git a/setup.py b/setup.py index 4ee4ea53..fdd8af30 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,8 @@ def read(*paths): 'linear-tsv>=1.0,<2.0a', 'unicodecsv>=0.14,<1.0a', 'jsonlines>=1.1,<1.2', + 'ezodf>=0.3,<0.4', + 'lxml', # required by ezodf ] TESTS_REQUIRE = [ 'pylama', diff --git a/tabulator/config.py b/tabulator/config.py index 6c5acf3d..f6e4585d 100644 --- a/tabulator/config.py +++ b/tabulator/config.py @@ -31,6 +31,7 @@ 'tsv': 'tabulator.parsers.tsv.TSVParser', 'xls': 'tabulator.parsers.excel.ExcelParser', 'xlsx': 'tabulator.parsers.excelx.ExcelxParser', + 'ods': 'tabulator.parsers.ods.ODSParser', } WRITERS = { diff --git a/tabulator/parsers/ods.py b/tabulator/parsers/ods.py new file mode 100644 index 00000000..f6dc4c1b --- /dev/null +++ b/tabulator/parsers/ods.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +from __future__ import division +from __future__ import print_function +from __future__ import absolute_import +from __future__ import unicode_literals + +import ezodf +from six import BytesIO + +from .. import helpers +from . import api + + +# Module API + +class ODSParser(api.Parser): + """Parser to parse ODF Spreadsheets. + + Args: + sheet (int or str): sheet number or name + First sheet's number is 1. + + """ + + # Public + + options = [ + 'sheet', + ] + + def __init__(self, sheet=1): + self.__index = sheet - 1 if isinstance(sheet, int) else sheet + self.__loader = None + self.__bytes = None + self.__book = None + self.__sheet = None + self.__extended_rows = None + + @property + def closed(self): + return self.__bytes is None or self.__bytes.closed + + def open(self, source, encoding, loader): + self.close() + self.__loader = loader + self.__bytes = loader.load(source, encoding, mode='b') + self.__book = ezodf.opendoc(BytesIO(self.__bytes.read())) + self.__sheet = self.__book.sheets[self.__index] + self.reset() + + def close(self): + if not self.closed: + self.__bytes.close() + + def reset(self): + helpers.reset_stream(self.__bytes) + self.__extended_rows = self.__iter_extended_rows() + + @property + def extended_rows(self): + return self.__extended_rows + + # Private + + def __iter_extended_rows(self): + for number, row in enumerate(self.__sheet.rows(), start=1): + yield number, None, [cell.value for cell in row] diff --git a/tests/parsers/test_ods.py b/tests/parsers/test_ods.py new file mode 100644 index 00000000..ac2c1cc4 --- /dev/null +++ b/tests/parsers/test_ods.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +from __future__ import division +from __future__ import print_function +from __future__ import absolute_import +from __future__ import unicode_literals + +import io +from mock import Mock +from tabulator import Stream +from tabulator.parsers.ods import ODSParser + + +# Tests + +def test_excelx_parser(): + + source = 'data/table.ods' + encoding = None + loader = Mock() + loader.load = Mock(return_value=io.open(source, 'rb')) + parser = ODSParser() + + assert parser.closed + parser.open(source, encoding, loader) + assert not parser.closed + + assert list(parser.extended_rows) == [ + (1, None, ['id', 'name', None]), + (2, None, [1.0, 'english', None]), + (3, None, [2.0, '中国人', None]), + (4, None, [None, None, None]), + (5, None, [None, None, None]), + ] + + assert len(list(parser.extended_rows)) == 0 + parser.reset() + assert len(list(parser.extended_rows)) == 5 + + parser.close() + assert parser.closed + + +def test_stream_ods(): + with Stream('data/table.ods', headers=1) as stream: + assert stream.headers == ['id', 'name', None] + assert stream.read(keyed=True) == [ + {'id': 1.0, 'name': 'english', None: None}, + {'id': 2.0, 'name': '中国人', None: None}, + {'id': None, 'name': None, None: None}, + {'id': None, 'name': None, None: None}, + ]