@@ -34,7 +34,10 @@ use itertools::Itertools;
3434use parquet:: {
3535 arrow:: ArrowWriter ,
3636 basic:: Encoding ,
37- file:: { FOOTER_SIZE , properties:: WriterProperties } ,
37+ file:: {
38+ FOOTER_SIZE , properties:: WriterProperties , reader:: FileReader ,
39+ serialized_reader:: SerializedFileReader ,
40+ } ,
3841 format:: SortingColumn ,
3942 schema:: types:: ColumnPath ,
4043} ;
@@ -409,7 +412,7 @@ impl Stream {
409412 . map ( |file| file. path ( ) )
410413 . filter ( |file| {
411414 file. extension ( ) . is_some_and ( |ext| ext. eq ( "parquet" ) )
412- && std :: fs :: metadata ( file) . is_ok_and ( |meta| meta . len ( ) > FOOTER_SIZE as u64 )
415+ && Self :: is_valid_parquet_file ( file, & self . stream_name )
413416 } )
414417 . collect ( )
415418 }
@@ -649,7 +652,7 @@ impl Stream {
649652 continue ;
650653 }
651654
652- if let Err ( e) = self . finalize_parquet_file ( & part_path, & parquet_path) {
655+ if let Err ( e) = std :: fs :: rename ( & part_path, & parquet_path) {
653656 error ! ( "Couldn't rename part file: {part_path:?} -> {parquet_path:?}, error = {e}" ) ;
654657 } else {
655658 self . cleanup_arrow_files_and_dir ( & arrow_files) ;
@@ -682,12 +685,10 @@ impl Stream {
682685 }
683686 writer. close ( ) ?;
684687
685- if part_file. metadata ( ) . expect ( "File was just created" ) . len ( )
686- < parquet:: file:: FOOTER_SIZE as u64
687- {
688+ if !Self :: is_valid_parquet_file ( part_path, & self . stream_name ) {
688689 error ! (
689- "Invalid parquet file {part_path:?} detected for stream {}, removing it" ,
690- & self . stream_name
690+ "Invalid parquet file {part_path:?} detected for stream {stream_name }, removing it" ,
691+ stream_name = & self . stream_name
691692 ) ;
692693 remove_file ( part_path) . expect ( "File should be removable if it is invalid" ) ;
693694 return Ok ( false ) ;
@@ -696,8 +697,47 @@ impl Stream {
696697 Ok ( true )
697698 }
698699
699- fn finalize_parquet_file ( & self , part_path : & Path , parquet_path : & Path ) -> std:: io:: Result < ( ) > {
700- std:: fs:: rename ( part_path, parquet_path)
700+ /// function to validate parquet files
701+ fn is_valid_parquet_file ( path : & Path , stream_name : & str ) -> bool {
702+ // First check file size as a quick validation
703+ match path. metadata ( ) {
704+ Ok ( meta) if meta. len ( ) < FOOTER_SIZE as u64 => {
705+ error ! (
706+ "Invalid parquet file {path:?} detected for stream {stream_name}, size: {} bytes" ,
707+ meta. len( )
708+ ) ;
709+ return false ;
710+ }
711+ Err ( e) => {
712+ error ! (
713+ "Cannot read metadata for parquet file {path:?} for stream {stream_name}: {e}"
714+ ) ;
715+ return false ;
716+ }
717+ _ => { } // File size is adequate, continue validation
718+ }
719+
720+ // Try to open and read the parquet file metadata to verify it's valid
721+ match std:: fs:: File :: open ( path) {
722+ Ok ( file) => match SerializedFileReader :: new ( file) {
723+ Ok ( reader) => {
724+ if reader. metadata ( ) . file_metadata ( ) . num_rows ( ) == 0 {
725+ error ! ( "Invalid parquet file {path:?} for stream {stream_name}" ) ;
726+ false
727+ } else {
728+ true
729+ }
730+ }
731+ Err ( e) => {
732+ error ! ( "Failed to read parquet file {path:?} for stream {stream_name}: {e}" ) ;
733+ false
734+ }
735+ } ,
736+ Err ( e) => {
737+ error ! ( "Failed to open parquet file {path:?} for stream {stream_name}: {e}" ) ;
738+ false
739+ }
740+ }
701741 }
702742
703743 fn cleanup_arrow_files_and_dir ( & self , arrow_files : & [ PathBuf ] ) {
@@ -951,7 +991,10 @@ impl Stream {
951991 shutdown_signal : bool ,
952992 ) -> Result < ( ) , StagingError > {
953993 let start_flush = Instant :: now ( ) ;
954- self . flush ( shutdown_signal) ;
994+ // Force flush for init or shutdown signals to convert all .part files to .arrows
995+ // For regular cycles, use false to only flush non-current writers
996+ let forced = init_signal || shutdown_signal;
997+ self . flush ( forced) ;
955998 trace ! (
956999 "Flushing stream ({}) took: {}s" ,
9571000 self . stream_name,
0 commit comments