@@ -38,6 +38,7 @@ struct ioam6_lwt_freq {
3838};
3939
4040struct ioam6_lwt {
41+ struct dst_entry null_dst ;
4142 struct dst_cache cache ;
4243 struct ioam6_lwt_freq freq ;
4344 atomic_t pkt_cnt ;
@@ -177,6 +178,14 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
177178 if (err )
178179 goto free_lwt ;
179180
181+ /* This "fake" dst_entry will be stored in a dst_cache, which will call
182+ * dst_hold() and dst_release() on it. We must ensure that dst_destroy()
183+ * will never be called. For that, its initial refcount is 1 and +1 when
184+ * it is stored in the cache. Then, +1/-1 each time we read the cache
 185+ * and release it. Net effect: the refcount can never drop to zero, so
 185+ * dst_destroy() is never invoked on this embedded entry.
186+ */
187+ dst_init (& ilwt -> null_dst , NULL , NULL , DST_OBSOLETE_NONE , DST_NOCOUNT );
188+
180189 atomic_set (& ilwt -> pkt_cnt , 0 );
181190 ilwt -> freq .k = freq_k ;
182191 ilwt -> freq .n = freq_n ;
@@ -336,33 +345,45 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
336345
337346static int ioam6_output (struct net * net , struct sock * sk , struct sk_buff * skb )
338347{
339- struct dst_entry * dst = skb_dst (skb ), * cache_dst = NULL ;
348+ struct dst_entry * orig_dst = skb_dst (skb );
349+ struct dst_entry * dst = NULL ;
340350 struct ioam6_lwt * ilwt ;
341351 int err = - EINVAL ;
342352 u32 pkt_cnt ;
343353
344354 if (skb -> protocol != htons (ETH_P_IPV6 ))
345355 goto drop ;
346356
347- ilwt = ioam6_lwt_state (dst -> lwtstate );
357+ ilwt = ioam6_lwt_state (orig_dst -> lwtstate );
348358
349359 /* Check for insertion frequency (i.e., "k over n" insertions) */
350360 pkt_cnt = atomic_fetch_inc (& ilwt -> pkt_cnt );
351361 if (pkt_cnt % ilwt -> freq .n >= ilwt -> freq .k )
352362 goto out ;
353363
354364 local_bh_disable ();
355- cache_dst = dst_cache_get (& ilwt -> cache );
365+ dst = dst_cache_get (& ilwt -> cache );
356366 local_bh_enable ();
357367
 368+	/* Reading null_dst back from the cache signals that the destination
 369+	 * is unchanged after transformation, so use orig_dst instead of a
 370+	 * cached route.
 371+	 */
371+ if (dst == & ilwt -> null_dst ) {
372+ dst_release (dst );
373+
374+ dst = orig_dst ;
375+ /* keep refcount balance: dst_release() is called at the end */
376+ dst_hold (dst );
377+ }
378+
358379 switch (ilwt -> mode ) {
359380 case IOAM6_IPTUNNEL_MODE_INLINE :
360381do_inline :
361382 /* Direct insertion - if there is no Hop-by-Hop yet */
362383 if (ipv6_hdr (skb )-> nexthdr == NEXTHDR_HOP )
363384 goto out ;
364385
365- err = ioam6_do_inline (net , skb , & ilwt -> tuninfo , cache_dst );
386+ err = ioam6_do_inline (net , skb , & ilwt -> tuninfo , dst );
366387 if (unlikely (err ))
367388 goto drop ;
368389
@@ -372,7 +393,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
372393 /* Encapsulation (ip6ip6) */
373394 err = ioam6_do_encap (net , skb , & ilwt -> tuninfo ,
374395 ilwt -> has_tunsrc , & ilwt -> tunsrc ,
375- & ilwt -> tundst , cache_dst );
396+ & ilwt -> tundst , dst );
376397 if (unlikely (err ))
377398 goto drop ;
378399
@@ -390,7 +411,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
390411 goto drop ;
391412 }
392413
393- if (unlikely (!cache_dst )) {
414+ if (unlikely (!dst )) {
394415 struct ipv6hdr * hdr = ipv6_hdr (skb );
395416 struct flowi6 fl6 ;
396417
@@ -401,43 +422,54 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
401422 fl6 .flowi6_mark = skb -> mark ;
402423 fl6 .flowi6_proto = hdr -> nexthdr ;
403424
404- cache_dst = ip6_route_output (net , NULL , & fl6 );
405- if (cache_dst -> error ) {
406- err = cache_dst -> error ;
425+ dst = ip6_route_output (net , NULL , & fl6 );
426+ if (dst -> error ) {
427+ err = dst -> error ;
407428 goto drop ;
408429 }
409430
410- /* cache only if we don't create a dst reference loop */
411- if (dst -> lwtstate != cache_dst -> lwtstate ) {
412- local_bh_disable ();
413- dst_cache_set_ip6 (& ilwt -> cache , cache_dst , & fl6 .saddr );
414- local_bh_enable ();
415- }
416-
417- err = skb_cow_head (skb , LL_RESERVED_SPACE (cache_dst -> dev ));
431+ /* If the destination is the same after transformation (which is
432+ * a valid use case for IOAM), then we don't want to add it to
433+ * the cache in order to avoid a reference loop. Instead, we add
434+ * our fake dst_entry to the cache as a way to detect this case.
435+ * Otherwise, we add the resolved destination to the cache.
436+ */
437+ local_bh_disable ();
438+ if (orig_dst -> lwtstate == dst -> lwtstate )
439+ dst_cache_set_ip6 (& ilwt -> cache ,
440+ & ilwt -> null_dst , & fl6 .saddr );
441+ else
442+ dst_cache_set_ip6 (& ilwt -> cache , dst , & fl6 .saddr );
443+ local_bh_enable ();
444+
445+ err = skb_cow_head (skb , LL_RESERVED_SPACE (dst -> dev ));
418446 if (unlikely (err ))
419447 goto drop ;
420448 }
421449
422450 /* avoid lwtunnel_output() reentry loop when destination is the same
423451 * after transformation (e.g., with the inline mode)
424452 */
425- if (dst -> lwtstate != cache_dst -> lwtstate ) {
453+ if (orig_dst -> lwtstate != dst -> lwtstate ) {
426454 skb_dst_drop (skb );
427- skb_dst_set (skb , cache_dst );
455+ skb_dst_set (skb , dst );
428456 return dst_output (net , sk , skb );
429457 }
430458out :
431- dst_release (cache_dst );
432- return dst -> lwtstate -> orig_output (net , sk , skb );
459+ dst_release (dst );
460+ return orig_dst -> lwtstate -> orig_output (net , sk , skb );
433461drop :
434- dst_release (cache_dst );
462+ dst_release (dst );
435463 kfree_skb (skb );
436464 return err ;
437465}
438466
/* lwtunnel .destroy_state hook: tear down the per-LWT dst cache.
 *
 * Since the refcount of per-cpu dst_entry caches will never be 0 (see
 * why above) when our "fake" dst_entry is used, it is not necessary to
 * remove them before calling dst_cache_destroy()
 */
static void ioam6_destroy_state(struct lwtunnel_state *lwt)
{
	dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
}
443475
0 commit comments