
Commit 913979a

limit record_batch row count
1 parent 2bb93ec commit 913979a

File tree

3 files changed: +20 -12 lines


src/parseable/staging/mod.rs (+2 -2)

@@ -30,6 +30,6 @@ pub enum StagingError {
     ObjectStorage(#[from] std::io::Error),
     #[error("Could not generate parquet file")]
     Create,
-    // #[error("Metadata Error: {0}")]
-    // Metadata(#[from] MetadataError),
+    #[error("Too many rows: {0}")]
+    RowLimit(usize),
 }
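For context, a minimal sketch of how a call site might surface the new variant to a client instead of treating it as an internal failure; the helper function and its wording are assumptions for illustration, not part of this commit:

use crate::parseable::staging::StagingError;

// Hypothetical call-site sketch: give RowLimit its own message,
// fall back to the thiserror-derived Display for everything else.
fn describe_staging_error(err: &StagingError) -> String {
    match err {
        StagingError::RowLimit(rows) => {
            format!("record batch has {rows} rows, which exceeds the staging limit")
        }
        other => format!("staging failed: {other}"),
    }
}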

src/parseable/staging/writer.rs (+8 -8)

@@ -37,13 +37,13 @@ use crate::utils::arrow::adapt_batch;
 use super::StagingError;
 
 /// Context regarding `.arrows` file being persisted onto disk
-pub struct DiskWriter<const N: usize> {
+pub struct DiskWriter {
     inner: FileWriter<BufWriter<File>>,
     /// Used to ensure un"finish"ed arrow files are renamed on "finish"
     path_prefix: String,
 }
 
-impl<const N: usize> DiskWriter<N> {
+impl DiskWriter {
     pub fn new(path_prefix: String, schema: &Schema) -> Result<Self, StagingError> {
         // Live writes happen into partfile
         let partfile_path = format!("{path_prefix}.{ARROW_PART_FILE_EXTENSION}");
@@ -81,12 +81,6 @@ impl<const N: usize> DiskWriter<N> {
     }
 }
 
-#[derive(Default)]
-pub struct Writer<const N: usize> {
-    pub mem: MemWriter<N>,
-    pub disk: HashMap<String, DiskWriter<N>>,
-}
-
 /// Structure to keep recordbatches in memory.
 ///
 /// Any new schema is updated in the schema map.
@@ -178,3 +172,9 @@ impl<const N: usize> MutableBuffer<N> {
         }
     }
 }
+
+#[derive(Default)]
+pub struct Writer<const N: usize> {
+    pub mem: MemWriter<N>,
+    pub disk: HashMap<String, DiskWriter>,
+}

src/parseable/streams.rs (+10 -2)

@@ -66,6 +66,9 @@ use super::{
     LogStream, ARROW_FILE_EXTENSION,
 };
 
+// ~16K rows is default in-memory limit for each recordbatch
+const MAX_RECORD_BATCH_SIZE: usize = 16384;
+
 /// Regex pattern for parsing arrow file names.
 ///
 /// # Format
@@ -113,8 +116,8 @@ pub struct Stream {
     pub metadata: RwLock<LogStreamMetadata>,
     pub data_path: PathBuf,
     pub options: Arc<Options>,
-    /// Writer with a 16KB buffer size for optimal I/O performance.
-    pub writer: Mutex<Writer<16384>>,
+    /// Writer with a ~16K rows limit for optimal I/O performance.
+    pub writer: Mutex<Writer<MAX_RECORD_BATCH_SIZE>>,
     pub ingestor_id: Option<String>,
 }
 
@@ -147,6 +150,11 @@ impl Stream {
         custom_partition_values: &HashMap<String, String>,
         stream_type: StreamType,
     ) -> Result<(), StagingError> {
+        let row_count = record.num_rows();
+        if row_count > MAX_RECORD_BATCH_SIZE {
+            return Err(StagingError::RowLimit(row_count));
+        }
+
         let mut guard = self.writer.lock().unwrap();
         if self.options.mode != Mode::Query || stream_type == StreamType::Internal {
             match guard.disk.get_mut(schema_key) {