@@ -47,53 +47,58 @@ use std::ops::Range;
4747/// requires the most IO operations - one to read the footer and then one
4848/// to read the metadata, and possibly more if page indexes are requested.
4949///
50- /// ```rust
51- /// # use std::ops::Range;
52- /// # use bytes::Bytes;
53- /// # use arrow_array::record_batch;
54- /// # use parquet::DecodeResult;
55- /// # use parquet::arrow::ArrowWriter;
56- /// # use parquet::errors::ParquetError;
57- /// # use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataPushDecoder};
58- /// #
59- /// # fn decode_metadata() -> Result<ParquetMetaData, ParquetError> {
60- /// # let file_bytes = {
61- /// # let mut buffer = vec![0];
62- /// # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
63- /// # let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
64- /// # writer.write(&batch).unwrap();
65- /// # writer.close().unwrap();
66- /// # Bytes::from(buffer)
67- /// # };
68- /// # // mimic IO by returning a function that returns the bytes for a given range
69- /// # let get_range = |range: &Range<u64>| -> Bytes {
70- /// # let start = range.start as usize;
71- /// # let end = range.end as usize;
72- /// # file_bytes.slice(start..end)
73- /// # };
74- /// #
75- /// # let file_len = file_bytes.len() as u64;
76- /// // The `ParquetMetaDataPushDecoder` needs to know the file length.
77- /// let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
78- /// // try to decode the metadata. If more data is needed, the decoder will tell you what ranges
79- /// loop {
80- /// match decoder.try_decode() {
81- /// Ok(DecodeResult::Data(metadata)) => { return Ok(metadata); } // decode successful
82- /// Ok(DecodeResult::NeedsData(ranges)) => {
83- /// // The decoder needs more data
84- /// //
85- /// // In this example, we call a function that returns the bytes for each given range.
86- /// // In a real application, you would likely read the data from a file or network.
87- /// let data = ranges.iter().map(|range| get_range(range)).collect();
88- /// // Push the data into the decoder and try to decode again on the next iteration.
89- /// decoder.push_ranges(ranges, data).unwrap();
90- /// }
91- /// Ok(DecodeResult::Finished) => { unreachable!("returned metadata in previous match arm") }
92- /// Err(e) => return Err(e),
93- /// }
94- /// }
95- /// # }
96- /// ```
50+ #[ cfg_attr(
51+ feature = "arrow" ,
52+ doc = r##"
53+ ```rust
54+ # use std::ops::Range;
55+ # use bytes::Bytes;
56+ # use arrow_array::record_batch;
57+ # use parquet::DecodeResult;
58+ # use parquet::arrow::ArrowWriter;
59+ # use parquet::errors::ParquetError;
60+ # use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataPushDecoder};
61+ #
62+ # fn decode_metadata() -> Result<ParquetMetaData, ParquetError> {
63+ # let file_bytes = {
64+ # let mut buffer = vec![0];
65+ # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
66+ # let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
67+ # writer.write(&batch).unwrap();
68+ # writer.close().unwrap();
69+ # Bytes::from(buffer)
70+ # };
71+ # // mimic IO by returning a function that returns the bytes for a given range
72+ # let get_range = |range: &Range<u64>| -> Bytes {
73+ # let start = range.start as usize;
74+ # let end = range.end as usize;
75+ # file_bytes.slice(start..end)
76+ # };
77+ #
78+ # let file_len = file_bytes.len() as u64;
79+ // The `ParquetMetaDataPushDecoder` needs to know the file length.
80+ let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
81+ // Try to decode the metadata. If more data is needed, the decoder will tell you which ranges it needs.
82+ loop {
83+ match decoder.try_decode() {
84+ Ok(DecodeResult::Data(metadata)) => { return Ok(metadata); } // decode successful
85+ Ok(DecodeResult::NeedsData(ranges)) => {
86+ // The decoder needs more data
87+ //
88+ // In this example, we call a function that returns the bytes for each given range.
89+ // In a real application, you would likely read the data from a file or network.
90+ let data = ranges.iter().map(|range| get_range(range)).collect();
91+ // Push the data into the decoder and try to decode again on the next iteration.
92+ decoder.push_ranges(ranges, data).unwrap();
93+ }
94+ Ok(DecodeResult::Finished) => { unreachable!("returned metadata in previous match arm") }
95+ Err(e) => return Err(e),
96+ }
97+ }
98+ # }
99+ ```
100+ "##
101+ ) ]
97102///
98103/// # Example with "prefetching"
99104///
@@ -114,44 +119,48 @@ use std::ops::Range;
114119///
115120/// This approach can also be used when you have the entire file already in memory
116121/// for other reasons.
117- ///
118- /// ```rust
119- /// # use std::ops::Range;
120- /// # use bytes::Bytes;
121- /// # use arrow_array::record_batch;
122- /// # use parquet::DecodeResult;
123- /// # use parquet::arrow::ArrowWriter;
124- /// # use parquet::errors::ParquetError;
125- /// # use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataPushDecoder};
126- /// #
127- /// # fn decode_metadata() -> Result<ParquetMetaData, ParquetError> {
128- /// # let file_bytes = {
129- /// # let mut buffer = vec![0];
130- /// # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
131- /// # let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
132- /// # writer.write(&batch).unwrap();
133- /// # writer.close().unwrap();
134- /// # Bytes::from(buffer)
135- /// # };
136- /// #
137- /// let file_len = file_bytes.len() as u64;
138- /// // For this example, we "prefetch" all the bytes which we have in memory,
139- /// // but in a real application, you would likely read a chunk from the end
140- /// // for example 1MB.
141- /// let prefetched_bytes = file_bytes.clone();
142- /// let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
143- /// // push the prefetched bytes into the decoder
144- /// decoder.push_ranges(vec![0..file_len], vec![prefetched_bytes]).unwrap();
145- /// // The decoder will now be able to decode the metadata. Note in a real application,
146- /// // unless you can guarantee that the pushed data is enough to decode the metadata,
147- /// // you still need to call `try_decode` in a loop until it returns `DecodeResult::Data`
148- /// // as shown in the previous example
149- /// match decoder.try_decode() {
150- /// Ok(DecodeResult::Data(metadata)) => { return Ok(metadata); } // decode successful
151- /// other => { panic!("expected DecodeResult::Data, got: {other:?}") }
152- /// }
153- /// # }
154- /// ```
122+ #[ cfg_attr(
123+ feature = "arrow" ,
124+ doc = r##"
125+ ```rust
126+ # use std::ops::Range;
127+ # use bytes::Bytes;
128+ # use arrow_array::record_batch;
129+ # use parquet::DecodeResult;
130+ # use parquet::arrow::ArrowWriter;
131+ # use parquet::errors::ParquetError;
132+ # use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataPushDecoder};
133+ #
134+ # fn decode_metadata() -> Result<ParquetMetaData, ParquetError> {
135+ # let file_bytes = {
136+ # let mut buffer = vec![0];
137+ # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
138+ # let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
139+ # writer.write(&batch).unwrap();
140+ # writer.close().unwrap();
141+ # Bytes::from(buffer)
142+ # };
143+ #
144+ let file_len = file_bytes.len() as u64;
145+ // For this example, we "prefetch" all the bytes, which we already have in memory,
146+ // but in a real application, you would likely read a chunk from the end
147+ // of the file, for example the last 1MB.
148+ let prefetched_bytes = file_bytes.clone();
149+ let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
150+ // push the prefetched bytes into the decoder
151+ decoder.push_ranges(vec![0..file_len], vec![prefetched_bytes]).unwrap();
152+ // The decoder will now be able to decode the metadata. Note that in a real application,
153+ // unless you can guarantee that the pushed data is enough to decode the metadata,
154+ // you still need to call `try_decode` in a loop until it returns `DecodeResult::Data`,
155+ // as shown in the previous example.
156+ match decoder.try_decode() {
157+ Ok(DecodeResult::Data(metadata)) => { return Ok(metadata); } // decode successful
158+ other => { panic!("expected DecodeResult::Data, got: {other:?}") }
159+ }
160+ # }
161+ ```
162+ "##
163+ ) ]
155164///
156165/// # Example using [`AsyncRead`]
157166///
@@ -160,49 +169,53 @@ use std::ops::Range;
160169/// implement async IO itself. To use async IO, you simply write an async
161170/// wrapper around it that reads the required byte ranges and pushes them into the
162171/// decoder.
163- ///
164- /// ```rust
165- /// # use std::ops::Range;
166- /// # use bytes::Bytes;
167- /// use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
168- /// # use arrow_array::record_batch;
169- /// # use parquet::DecodeResult;
170- /// # use parquet::arrow::ArrowWriter;
171- /// # use parquet::errors::ParquetError;
172- /// # use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataPushDecoder};
173- /// #
174- /// // This function decodes Parquet Metadata from anything that implements
175- /// // [`AsyncRead`] and [`AsyncSeek`] such as a tokio::fs::File
176- /// async fn decode_metadata(
177- /// file_len: u64,
178- /// mut async_source: impl AsyncRead + AsyncSeek + Unpin
179- /// ) -> Result<ParquetMetaData, ParquetError> {
180- /// // We need a ParquetMetaDataPushDecoder to decode the metadata.
181- /// let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
182- /// loop {
183- /// match decoder.try_decode() {
184- /// Ok(DecodeResult::Data(metadata)) => { return Ok(metadata); } // decode successful
185- /// Ok(DecodeResult::NeedsData(ranges)) => {
186- /// // The decoder needs more data
187- /// //
188- /// // In this example we use the AsyncRead and AsyncSeek traits to read the
189- /// // required ranges from the async source.
190- /// let mut data = Vec::with_capacity(ranges.len());
191- /// for range in &ranges {
192- /// let mut buffer = vec![0; (range.end - range.start) as usize];
193- /// async_source.seek(std::io::SeekFrom::Start(range.start)).await?;
194- /// async_source.read_exact(&mut buffer).await?;
195- /// data.push(Bytes::from(buffer));
196- /// }
197- /// // Push the data into the decoder and try to decode again on the next iteration.
198- /// decoder.push_ranges(ranges, data).unwrap();
199- /// }
200- /// Ok(DecodeResult::Finished) => { unreachable!("returned metadata in previous match arm") }
201- /// Err(e) => return Err(e),
202- /// }
203- /// }
204- /// }
205- /// ```
172+ #[ cfg_attr(
173+ feature = "arrow" ,
174+ doc = r##"
175+ ```rust
176+ # use std::ops::Range;
177+ # use bytes::Bytes;
178+ use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
179+ # use arrow_array::record_batch;
180+ # use parquet::DecodeResult;
181+ # use parquet::arrow::ArrowWriter;
182+ # use parquet::errors::ParquetError;
183+ # use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataPushDecoder};
184+ #
185+ // This function decodes Parquet Metadata from anything that implements
186+ // [`AsyncRead`] and [`AsyncSeek`] such as a tokio::fs::File
187+ async fn decode_metadata(
188+ file_len: u64,
189+ mut async_source: impl AsyncRead + AsyncSeek + Unpin
190+ ) -> Result<ParquetMetaData, ParquetError> {
191+ // We need a ParquetMetaDataPushDecoder to decode the metadata.
192+ let mut decoder = ParquetMetaDataPushDecoder::try_new(file_len).unwrap();
193+ loop {
194+ match decoder.try_decode() {
195+ Ok(DecodeResult::Data(metadata)) => { return Ok(metadata); } // decode successful
196+ Ok(DecodeResult::NeedsData(ranges)) => {
197+ // The decoder needs more data
198+ //
199+ // In this example we use the AsyncRead and AsyncSeek traits to read the
200+ // required ranges from the async source.
201+ let mut data = Vec::with_capacity(ranges.len());
202+ for range in &ranges {
203+ let mut buffer = vec![0; (range.end - range.start) as usize];
204+ async_source.seek(std::io::SeekFrom::Start(range.start)).await?;
205+ async_source.read_exact(&mut buffer).await?;
206+ data.push(Bytes::from(buffer));
207+ }
208+ // Push the data into the decoder and try to decode again on the next iteration.
209+ decoder.push_ranges(ranges, data).unwrap();
210+ }
211+ Ok(DecodeResult::Finished) => { unreachable!("returned metadata in previous match arm") }
212+ Err(e) => return Err(e),
213+ }
214+ }
215+ }
216+ ```
217+ "##
218+ ) ]
206219/// [`AsyncRead`]: tokio::io::AsyncRead
207220#[ derive( Debug ) ]
208221pub struct ParquetMetaDataPushDecoder {
0 commit comments