@@ -849,6 +849,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data, int nr_folios)
+{
+	struct page **page_array = *pages, **new_array = NULL;
+	int nr_pages_left = *nr_pages, i, j;
+
+	/* Store head pages only */
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+					GFP_KERNEL);
+	if (!new_array)
+		return false;
+
+	new_array[0] = compound_head(page_array[0]);
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages_left -= data->nr_pages_head;
+	for (i = 1; i < nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		new_array[i] = page_array[j];
+		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+					data->nr_pages_mid - 1);
+		if (nr_unpin)
+			unpin_user_pages(&page_array[j + 1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages_left -= data->nr_pages_mid;
+	}
+	kvfree(page_array);
+	*pages = new_array;
+	*nr_pages = nr_folios;
+	return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+					 struct io_imu_folio_data *data)
+{
+	struct page **page_array = *pages;
+	struct folio *folio = page_folio(page_array[0]);
+	unsigned int count = 1, nr_folios = 1;
+	int i;
+
+	if (*nr_pages <= 1)
+		return false;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < *nr_pages; i++) {
+		if (page_folio(page_array[i]) == folio &&
+		    page_array[i] == page_array[i - 1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			if (folio_page_idx(folio, page_array[i - 1]) !=
+			    data->nr_pages_mid - 1)
+				return false;
+
+			data->nr_pages_head = count;
+		} else if (count != data->nr_pages_mid) {
+			return false;
+		}
+
+		folio = page_folio(page_array[i]);
+		if (folio_size(folio) != (1UL << data->folio_shift) ||
+		    folio_page_idx(folio, page_array[i]) != 0)
+			return false;
+
+		count = 1;
+		nr_folios++;
+	}
+	if (nr_folios == 1)
+		data->nr_pages_head = count;
+
+	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
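
Note: the two helpers above use struct io_imu_folio_data, which this hunk never defines. A minimal sketch of the layout implied by those field accesses follows; the field order, types, and any additional members are assumptions, not the definition from the series.

/* Sketch inferred from io_try_coalesce_buffer()/io_do_coalesce_buffer();
 * not the authoritative definition. */
struct io_imu_folio_data {
	/* Head folio can be only partially covered by the buffer */
	unsigned int	nr_pages_head;
	/* Pages per folio; every non-head folio must be fully covered */
	unsigned int	nr_pages_mid;
	/* log2 of the uniform folio size, e.g. 21 for 2M folios */
	unsigned int	folio_shift;
};
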
@@ -858,7 +950,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	struct io_imu_folio_data data;
+	bool coalesced;
 
 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
 	if (!iov->iov_base)
@@ -873,31 +966,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
+	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
+	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
 
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
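
Note: a standalone sketch (plain C, not kernel code) of what the coalescing buys at this call site. The 4K base page and 2M folio sizes are assumptions matching x86-64 hugetlb, and the address and length are made up:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096, folio_size = 2UL << 20;
	/* hypothetical buffer: starts 5 pages into a 2M folio, 6M long */
	unsigned long addr = 0x7f0000000000UL + 5 * page_size;
	unsigned long len = 6UL << 20;

	/* one bvec per base page: how nr_pages sized imu before */
	unsigned long first_page = addr & ~(page_size - 1);
	unsigned long last_page = (addr + len - 1) & ~(page_size - 1);
	unsigned long nr_pages = (last_page - first_page) / page_size + 1;

	/* one bvec per folio: nr_pages after a successful coalesce */
	unsigned long first_folio = addr & ~(folio_size - 1);
	unsigned long last_folio = (addr + len - 1) & ~(folio_size - 1);
	unsigned long nr_folios = (last_folio - first_folio) / folio_size + 1;

	printf("bvecs without coalescing: %lu\n", nr_pages);  /* 1536 */
	printf("bvecs with coalescing:    %lu\n", nr_folios); /* 4 */
	return 0;
}

With those numbers, the kvmalloc(struct_size(imu, bvec, nr_pages), ...) just below the call allocates 4 bvec entries instead of 1536.
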
@@ -909,25 +979,25 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
 	imu->folio_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->folio_shift = data.folio_shift;
+		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+	}
+	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
 	*pimu = imu;
 	ret = 0;
 
-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
-		goto done;
-	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
 
-		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
 		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
 		off = 0;
 		size -= vec_len;
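
Note: the subtle part of the last hunk is that off is now the buffer's offset within the first folio rather than within the first 4K page, and the per-iteration cap grows from PAGE_SIZE to the folio size. A standalone illustration with made-up values (folio_shift = 21 assumes a 2M folio after a successful coalesce):

#include <stdio.h>

int main(void)
{
	unsigned long folio_shift = 21; /* 2M folio after coalescing */
	unsigned long folio_mask = ~((1UL << folio_shift) - 1);
	unsigned long ubuf = 0x7f0000205000UL; /* hypothetical iov_base */

	/* same expression as off in io_sqe_buffer_register() */
	printf("off in folio: 0x%lx\n", ubuf & ~folio_mask); /* 0x5000 */

	/* with folio_shift = PAGE_SHIFT (12), i.e. no coalescing, the
	 * expression reduces to the old "& ~PAGE_MASK" behaviour */
	printf("off in page:  0x%lx\n", ubuf & ((1UL << 12) - 1)); /* 0x0 */
	return 0;
}

The first bvec then covers (1UL << folio_shift) - off bytes, the rest of that head folio, in a single entry.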