From e2a2513c3a5d6f0343ec042d3bb117eebef0cf35 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 13 Dec 2022 12:12:16 +0800 Subject: [PATCH] ARROW-18420: Add fint32_with_null_pages.parquet for page index test --- data/README.md | 1 + data/int32_with_null_pages.md | 73 +++++++++++++++++++++++++++++ data/int32_with_null_pages.parquet | Bin 0 -> 3829 bytes 3 files changed, 74 insertions(+) create mode 100644 data/int32_with_null_pages.md create mode 100644 data/int32_with_null_pages.parquet diff --git a/data/README.md b/data/README.md index 4bb59c2..b5d05a2 100644 --- a/data/README.md +++ b/data/README.md @@ -33,6 +33,7 @@ | alltypes_tiny_pages_plain.parquet | small page sizes with plain encoding with page index [impala](https://github.com/apache/impala/tree/master/testdata/data/alltypes_tiny_pages.parquet). | | rle_boolean_encoding.parquet | option boolean columns with RLE encoding | | fixed_length_byte_array.parquet | optional FIXED_LENGTH_BYTE_ARRAY column with page index. See [fixed_length_byte_array.md](fixed_length_byte_array.md) for details. | +| int32_with_null_pages.parquet | optional INT32 column with random null pages. See [int32_with_null_pages.md](int32_with_null_pages.md) for details. | | datapage_v1-uncompressed-checksum.parquet | uncompressed INT32 columns in v1 data pages with a matching CRC | | datapage_v1-snappy-compressed-checksum.parquet | compressed INT32 columns in v1 data pages with a matching CRC | | datapage_v1-corrupt-checksum.parquet | uncompressed INT32 columns in v1 data pages with a mismatching CRC | diff --git a/data/int32_with_null_pages.md b/data/int32_with_null_pages.md new file mode 100644 index 0000000..fe16340 --- /dev/null +++ b/data/int32_with_null_pages.md @@ -0,0 +1,73 @@ + + +`int32_with_null_pages.parquet` is generated by parquet-mr version 1.13.0-SNAPSHOT. + +It has a single column of int32 type with 1000 values and page index enabled. + +Both integer and null values are randomly generated. 
However, a null page (page-2, containing only nulls) is generated on purpose. + +# File Metadata (from parquet-cli meta command) +``` +File path: int32_with_null_pages.parquet +Created by: parquet-mr version 1.13.0-SNAPSHOT (build 433de8df33fcf31927f7b51456be9f53e64d48b9) +Properties: + writer.model.name: example +Schema: +message schema { + optional int32 int32_field; +} + + +Row group 0: count: 1000 3.33 B records start: 4 total(compressed): 3.250 kB total(uncompressed):3.250 kB +-------------------------------------------------------------------------------- + type encodings count avg size nulls min / max +int32_field INT32 _ _ 1000 3.33 B 275 "-2136906554" / "2145722375" +``` + +# Column Index (from parquet-cli column-index command) +``` +row-group 0: +column index for column int32_field: +Boundary order: UNORDERED + null count min max +page-0 8 -2135807632 2144701119 +page-1 55 -2104090659 1745329571 +page-2 100 +page-3 52 -2116849709 2077105757 +page-4 16 -2048691758 2143189382 +page-5 12 -2017923401 2087827129 +page-6 5 -2136906554 2125689411 +page-7 7 -2113313110 2145722375 +page-8 8 -2046900272 2087168549 +page-9 12 -1941944785 2078586537 + +offset index for column int32_field: + offset compressed size first row index +page-0 4 415 0 +page-1 419 220 100 +page-2 639 31 200 +page-3 670 228 300 +page-4 898 382 400 +page-5 1280 402 500 +page-6 1682 422 600 +page-7 2104 411 700 +page-8 2515 417 800 +page-9 2932 400 900 +``` diff --git a/data/int32_with_null_pages.parquet b/data/int32_with_null_pages.parquet new file mode 100644 index 0000000000000000000000000000000000000000..82637745ae557f57e509c7abc1ac780913cf3617 GIT binary patch literal 3829 zcmY*cXH*kg7fnbANeDd|h$2LUKtw<~L8M3*X@Ur%QbT(bDHa5zsI&(NQbJRDm8Nuf zfS?qSCW1=u#YY5`A_xgz;`hflv)9a;xo6g_efGWQ>}#TNSsDYvM8g<}sSGQE@nd?x z^fxdJ90cM5fk4a~GeTe%+DtGv<4Qw=T|D=`hzhDNu4HUd2iMzp3y_!ZaiN(`+)Iqd z8G1tY9qplOrK)^Qmi8_Yyfxy|sdr2Wa^wH@Hjq2BUc|yH(wfQO2yH$f;>9yoc~}d~ zfdy%xRYxm(f9u9eyiz~t?%VRe6ehPTmYAn@sGe5+N^`Rhs+_0#ZmcZ1hU--3In#8t 
z_C^)Fb?0r+%TdsAwF_A%+M-8;ucv%0zHq#XP)2;*EL%vvg5erZfsA5d8grq7I@34F z`GrG*G7#m3GflB!SKl9YOAO~89IrK4t-@|d`aS<*DpY@_bIHyPXW&T&_ z<-Cjn1WCK+>sKwMX}A8vi|z9k_d_JxI;pcyGL{nIm*%*x`%bzDI`2fT=?h1u#-kvH znb~Yy6w$PK57|{z6r`8W5pC4d@Z23%9;0&&BFi<(ay~y^bJV?4c9MoVMZHCHX z3G$$)ka?>%tECffWcXQyYE_-0tyy0CBOSpnIVG49=6YDyk^r$lNDbTZi!W{D48ERe zpOc=*%D=Q%_C0pUyg@jX@3|T)F8riXGXDI}hN`(JXw8AYh#sJ1Y2#YtG*| zBkviD3wLCCT6wy9-kiMsUN5$z{!WIyI*P5)9PgLb&#J_$mKOO^FIG%-WaWarINMsG zrrr#vu(Ho~?wyuvMbNNDq_A+?49ly5a5^`3`dQPTmdpGnGYFko4;VxrPJ%EHI+fW0 z{xbkZ4{!uQD$T(ky*`G1J%i^f5D8RdzPZ4-9r!;9{o_fGrDR%);qsRs@^#zq6e!ivYA4}7> z%s$!GZ8;Y%(8SVW-I^SZRn5BUnOF&ZTYSC|%0tjS-lYv#-;~sqIi%E0^I^TUekqP` zk8XqOlFp|g<0%_GBO~s*WXyLTHfu-iuG>=1Qo2n)e>U9jx*;$S(4aqI`_Hq@csj=( zW(G?8ieeFea^z&JzNuYFFq`Uz4b|oWCm152*6y7UGT{7kjPm_@UTtmVdE^wq^eNAP z)19N{Vef^<`&uIkW@_-mQ|k6nUJxqEmPhb)f|boj&wXlfqOmJaWWiaKwd7pj=ZI=m zU(7%tfwYtUy#th}5b*S`@}XF|;V9)>m_1?G@FAtDuQ(#@-J<)2L7CMmQzZV8l7 zkZ7*Xz4-RzaR1g|5gC(P&#>Bmj}5&7Uvs<4A#u%0C^Q>g5UXf&$dS|M$n?EVc6 zt8epjUeo32i`d(r_vY=MjWj!$Ui7jNFeW`Z_w!4indi2Dm${-V)kFC6zo$xi7e6x- zSI$NxS+2%DO!M(J``c_V>%IwM?WS#8CITo&hbI*W$IgaVe z2z7e)H<=k|s0*K9|1!K!mQ{wF-;E^9=G4n9f+njXSlK8!CSzT^J-EL-F6E=NM}B! zg)$FK2V*SnoHOuHrqoVN_XoU71c({JUTLZ*0W*F0W? z+tUMw!s3DP>i?jfq+L|Nt6zuGM#x@*6BKsk7S z3|94uRf}J%R7#hp5HHONdK3H|d-8G7pP%j)XH>+En$9p+`vQ5gqBUhVQmsd=E{itS zc?VkR-G6?VO=(~IvVUgTBF(0Ly&;0yagnFo@Gy0Bo)R$Z_RKy>Q6;bbVc}ni@`!$+pq

t{f-{U8J?E^6Ir@+8TVF}8jr(b>mRk3+zebBT$59hcGP%np&z^N8 zjYdvn5VohhhjsJc`9A-q=YSec%q85wOJiPKAcS1|I7HO-y&aKxFmI=>mZG;!K@f*B zUycQKIp%~6v-E__Z-tLG74AEZc^WI4GUiL9qr>hhOW8g)_U>nDuDk119g@eJYv~Tt6;P^tkZs*Ls<32$= z3Gt|faDJXLO!;Kq7`I9tEL*f9<-`wj7ncV4Q$~k`5`s;v(S^i}UF3cawD2+ZvF%e$ zQEK-0A{5JHW48T)1+G&o114|7-jnPwtKD+G)j8fI`pco3KPXdMIoH6>=d(~#tE}fx z*40U#Px_Ze!xTOQJEgJ)Ie(&}CHtQi%B(9S)eJR=0IqUUy!!cOrg6)^^qd5{&29aS z@#70uk-DYnx5Q4VyxA}_j=~M9F?*S>beB3I!YZUQBs(rzT7q$%h8;7G5Ula>pcUFh zb^aXP2W)|B#hhnP_O-krc86$7X1R{mNm&(JYwVm54@Z;Knf!O(R$={k8nxe#`SKr= zohf!p41#REL2`}vN@sT@PJ5(^ZQGnel@kk4NL5N=;0f5>vQ?hI5fEo6P#Xk-&;@}o-= z2V-D4a(?eWC5?wJ4P;+{UdMuufg!)deityyP+Z7h&i z5>!>N=Sf}>KB~MBZd^Xinp!GBO?g=2uqdpGg(VDhjNg)pe4l!!Dq!GY`WvToC52<_ z%8{!3esMuo5*2lp2-RlNovOA9R#ObIonvffcI~U`!q>xE|6M?>Dzjyj7&@e~vX#w| z%YXT>FzHm@0_np^qqSm$pwx>+9QQcE(^Xw%zZKiNiPfDFN|Fw}KgjXaZpTcsb}L0$ zh>dKayD#*@Q+U?cYnba8R<{Vb9((##=ofGueGb zv0BGjyhM2GS&sV^IlGU$YDN9v=yHv#+os55Cu1DW|x-2JB`Nq}|#TMTlZqfWi73U;$X+AJv z7kG#GAlD3OL51&(;Mp*nkKUOj>ygV#I-HF**`QC7Grjfy2Svpl$x~#?)>ZaD+x~3 zXB7cAo@!>pj|s<)#2ezvLqe)!9Bu&i zo2-?sZj}$Tu7qmqgffagl(CedjA;U85Vj0}V_IPV2n2M&FbxoZ z?s@?l0;3AVLqNbkNK6j|_yPq1kJvGX%s@5_1cab5`7mIW1q9?D!}P&`Dg+4F#9``S zz#b9=6pCUxV8Ck>2w0WCWW#_+b`S`|ggv7N^L20`dDvl?=#L^ye5|gXezGz)POc<( zM-VW?f#t{YhW*bth7kaS$kBm^tML71#P>THKp|A@k4F%epFX!i7`k-= zLKNuxNFY)UvqvXH|8|H252xShLs))3uDd?2ek31B4=+cOyQHU`2Z@h`bkEM?mOBZ= zXLifZ$NM(PPr}0ocZcNT>+0o+la`d0l{_n9W~5h6e>la+NODLFdH x$~rkX$x0Jt6rB|9<)!8173@hwCwW#j_Ed&?@GNEsH{|At0lPmxL literal 0 HcmV?d00001