@@ -849,6 +849,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
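+/*
+ * Collapse a pinned page array into one entry per folio. All but the
+ * first pinned page of each folio are unpinned; the remaining reference
+ * is dropped later by io_buffer_unmap(). The folio geometry in @data
+ * must already have been validated by io_try_coalesce_buffer().
+ */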
+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data, int nr_folios)
+{
+	struct page **page_array = *pages, **new_array = NULL;
+	int nr_pages_left = *nr_pages, i, j;
+
+	/* Store head pages only */
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+					GFP_KERNEL);
+	if (!new_array)
+		return false;
+
+	new_array[0] = compound_head(page_array[0]);
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages_left -= data->nr_pages_head;
+	for (i = 1; i < nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		new_array[i] = page_array[j];
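+		/* the tail folio may be pinned only partially */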
+		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+					data->nr_pages_mid - 1);
+		if (nr_unpin)
+			unpin_user_pages(&page_array[j + 1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages_left -= data->nr_pages_mid;
+	}
+	kvfree(page_array);
+	*pages = new_array;
+	*nr_pages = nr_folios;
+	return true;
+}
+
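+/*
+ * Check whether the pinned pages can be coalesced into one bvec entry
+ * per folio: pages must be contiguous within each folio, the head folio
+ * must run through to its last page, and every middle folio must have
+ * the same size and be fully covered. On success this fills in @data
+ * and hands off to io_do_coalesce_buffer().
+ */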
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data)
+{
+	struct page **page_array = *pages;
+	struct folio *folio = page_folio(page_array[0]);
+	unsigned int count = 1, nr_folios = 1;
+	int i;
+
+	if (*nr_pages <= 1)
+		return false;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < *nr_pages; i++) {
+		if (page_folio(page_array[i]) == folio &&
+		    page_array[i] == page_array[i - 1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			if (folio_page_idx(folio, page_array[i - 1]) !=
+			    data->nr_pages_mid - 1)
+				return false;
+
+			data->nr_pages_head = count;
+		} else if (count != data->nr_pages_mid) {
+			return false;
+		}
+
+		folio = page_folio(page_array[i]);
+		if (folio_size(folio) != (1UL << data->folio_shift) ||
+		    folio_page_idx(folio, page_array[i]) != 0)
+			return false;
+
+		count = 1;
+		nr_folios++;
+	}
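+	/* the whole buffer sits inside a single folio */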
+	if (nr_folios == 1)
+		data->nr_pages_head = count;
+
+	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
@@ -858,7 +950,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	struct io_imu_folio_data data;
+	bool coalesced;
 
 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
 	if (!iov->iov_base)
@@ -873,31 +966,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
+	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
+	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
 
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
@@ -909,25 +979,25 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
 	imu->folio_mask = PAGE_MASK;
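+	/*
+	 * A coalesced buffer is indexed at folio granularity, so widen
+	 * the shift/mask from page size to folio size.
+	 */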
+	if (coalesced) {
+		imu->folio_shift = data.folio_shift;
+		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+	}
+	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
 	*pimu = imu;
 	ret = 0;
 
-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
-		goto done;
-	}
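+	/*
+	 * One bvec entry per pages[] element; after coalescing each entry
+	 * spans up to a full folio, and only the first has a non-zero off.
+	 */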
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
 
-		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
 		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
 		off = 0;
 		size -= vec_len;
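
Note: the helpers above fill in a struct io_imu_folio_data that is added
elsewhere in this series. Sketched purely from how the fields are used
here (names as referenced by the code; exact layout assumed), it would
look roughly like:

	struct io_imu_folio_data {
		/* Head folio can be partially included in the fixed buf */
		unsigned int	nr_pages_head;
		/* For non-head/tail folios, must be fully included */
		unsigned int	nr_pages_mid;
		unsigned int	folio_shift;
	};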