@@ -381,6 +381,78 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	return failure ? budget : (int)total_rx_packets;
 }

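+/**
+ * i40e_xmit_pkt - Place one AF_XDP descriptor on the XDP Tx ring
+ * @xdp_ring: XDP Tx ring
+ * @desc: AF_XDP descriptor to post
+ * @total_bytes: running byte count, updated with the length of this packet
+ *
+ * Writes a single Tx descriptor and advances next_to_use without handling
+ * ring wrap; the caller is responsible for staying within the ring.
+ **/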
+static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+			  unsigned int *total_bytes)
+{
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+
+	dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
+	xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
+
+	tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
+	tx_desc->buffer_addr = cpu_to_le64(dma);
+	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
+						  0, desc->len, 0);
+
+	*total_bytes += desc->len;
+}
+
+/* This value should match the pragma below. Why 4? It is strictly
+ * empirical. It seems to be a good compromise between the advantage
+ * of having simultaneous outstanding reads to the DMA array that can
+ * hide each other's latency and the disadvantage of having a larger
+ * code path.
+ */
+#define PKTS_PER_BATCH 4
+
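+/**
+ * i40e_xmit_pkt_batch - Place PKTS_PER_BATCH AF_XDP descriptors on the Tx ring
+ * @xdp_ring: XDP Tx ring
+ * @desc: array of at least PKTS_PER_BATCH AF_XDP descriptors
+ * @total_bytes: running byte count, updated with the length of each packet
+ *
+ * The loop is unrolled PKTS_PER_BATCH times so that several DMA address
+ * lookups can be outstanding at once (see the comment above PKTS_PER_BATCH).
+ **/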
+static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+				unsigned int *total_bytes)
+{
+	u16 ntu = xdp_ring->next_to_use;
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+	u32 i;
+
+#pragma GCC unroll 4
+	for (i = 0; i < PKTS_PER_BATCH; i++) {
+		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
+		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
+
+		tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
+		tx_desc->buffer_addr = cpu_to_le64(dma);
+		tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
+							  I40E_TX_DESC_CMD_EOP,
+							  0, desc[i].len, 0);
+
+		*total_bytes += desc[i].len;
+	}
+
+	xdp_ring->next_to_use = ntu;
+}
+
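+/**
+ * i40e_fill_tx_hw_ring - Post a block of AF_XDP descriptors to the Tx ring
+ * @xdp_ring: XDP Tx ring
+ * @descs: array of AF_XDP descriptors to post
+ * @nb_pkts: number of descriptors in @descs
+ * @total_bytes: running byte count, updated for every posted packet
+ *
+ * Since PKTS_PER_BATCH is a power of two, nb_pkts is split with simple
+ * masking into full batches of PKTS_PER_BATCH packets, sent via
+ * i40e_xmit_pkt_batch(), and a leftover of fewer than PKTS_PER_BATCH
+ * packets, sent one at a time via i40e_xmit_pkt().
+ **/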
+static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
+				 unsigned int *total_bytes)
+{
+	u32 batched, leftover, i;
+
+	batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
+	leftover = nb_pkts & (PKTS_PER_BATCH - 1);
+	for (i = 0; i < batched; i += PKTS_PER_BATCH)
+		i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
+	for (i = batched; i < batched + leftover; i++)
+		i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
+}
+
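+/**
+ * i40e_set_rs_bit - Set the RS bit on the last descriptor that was written
+ * @xdp_ring: XDP Tx ring
+ *
+ * The last written descriptor is at next_to_use - 1, wrapping back to the
+ * end of the ring when next_to_use is 0. i40e_xmit_zc() uses this to
+ * request an interrupt for the last frame of a burst.
+ **/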
+static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
+{
+	u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
+	struct i40e_tx_desc *tx_desc;
+
+	tx_desc = I40E_TX_DESC(xdp_ring, ntu);
+	tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
+}
+
 /**
  * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
  * @xdp_ring: XDP Tx ring
@@ -390,45 +462,30 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
  **/
 static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 {
-	unsigned int sent_frames = 0, total_bytes = 0;
-	struct i40e_tx_desc *tx_desc = NULL;
-	struct xdp_desc desc;
-	dma_addr_t dma;
-
-	while (budget-- > 0) {
-		if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
-			break;
-
-		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
-		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
-						 desc.len);
-
-		tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
-		tx_desc->buffer_addr = cpu_to_le64(dma);
-		tx_desc->cmd_type_offset_bsz =
-			build_ctob(I40E_TX_DESC_CMD_ICRC
-				   | I40E_TX_DESC_CMD_EOP,
-				   0, desc.len, 0);
-
-		sent_frames++;
-		total_bytes += desc.len;
-
-		xdp_ring->next_to_use++;
-		if (xdp_ring->next_to_use == xdp_ring->count)
-			xdp_ring->next_to_use = 0;
+	struct xdp_desc *descs = xdp_ring->xsk_descs;
+	u32 nb_pkts, nb_processed = 0;
+	unsigned int total_bytes = 0;
+
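+	/* Grab up to budget descriptors from the AF_XDP Tx ring in one
+	 * batched call rather than peeking them one at a time.
+	 */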
+	nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
+	if (!nb_pkts)
+		return false;
+
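+	/* If the burst would run past the end of the ring, fill the ring up
+	 * to the end first, wrap next_to_use back to the start, and post the
+	 * rest below.
+	 */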
+	if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
+		nb_processed = xdp_ring->count - xdp_ring->next_to_use;
+		i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
+		xdp_ring->next_to_use = 0;
 	}

-	if (tx_desc) {
-		/* Request an interrupt for the last frame and bump tail ptr. */
-		tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
-						 I40E_TXD_QW1_CMD_SHIFT);
-		i40e_xdp_ring_update_tail(xdp_ring);
+	i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
+			     &total_bytes);

-		xsk_tx_release(xdp_ring->xsk_pool);
-		i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
-	}
+	/* Request an interrupt for the last frame and bump tail ptr. */
+	i40e_set_rs_bit(xdp_ring);
+	i40e_xdp_ring_update_tail(xdp_ring);
+
+	i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);

-	return !!budget;
+	return true;
 }

 /**