From c7bc30c24f8625c6e9ef41be427fa26c6eb3d2bf Mon Sep 17 00:00:00 2001 From: Peter Holm Date: Sat, 17 Aug 2024 08:37:34 +0200 Subject: [PATCH 001/145] stress2: Some tests use hw.ncpu to scale the load. Tests on a box with a large number of CPUs show that this number needs to be capped --- tools/test/stress2/misc/buildkernel.sh | 1 + tools/test/stress2/misc/buildworld.sh | 1 + tools/test/stress2/misc/buildworld2.sh | 1 + tools/test/stress2/misc/buildworld3.sh | 1 + tools/test/stress2/misc/buildworld4.sh | 1 + tools/test/stress2/misc/crossmp3.sh | 1 + tools/test/stress2/misc/crossmp4.sh | 1 + tools/test/stress2/misc/crossmp5.sh | 1 + tools/test/stress2/misc/crossmp8.sh | 1 + tools/test/stress2/misc/gnop4.sh | 3 +++ tools/test/stress2/misc/tmpfs13.sh | 1 + tools/test/stress2/misc/zzbuildworld.sh | 1 + 12 files changed, 14 insertions(+) diff --git a/tools/test/stress2/misc/buildkernel.sh b/tools/test/stress2/misc/buildkernel.sh index 849a09b81439..e0aa85617f9b 100755 --- a/tools/test/stress2/misc/buildkernel.sh +++ b/tools/test/stress2/misc/buildkernel.sh @@ -49,6 +49,7 @@ chmod 0777 $TMPDIR log=$mntpoint/log p=$((`sysctl -n hw.ncpu`+ 1)) +[ $p -gt 32 ] && p=32 # Arbitrary cap p=`jot -r 1 1 $p` echo "make -j $p buildkernel KERNCONF=GENERIC DESTDIR=$mntpoint" \ "TARGET=amd64 TARGET_ARCH=amd64" diff --git a/tools/test/stress2/misc/buildworld.sh b/tools/test/stress2/misc/buildworld.sh index 595b387c90ae..3b362ec7041a 100755 --- a/tools/test/stress2/misc/buildworld.sh +++ b/tools/test/stress2/misc/buildworld.sh @@ -55,6 +55,7 @@ mkdir $TMPDIR chmod 0777 $TMPDIR p=$((`sysctl -n hw.ncpu`+ 1)) +[ $p -gt 32 ] && p=32 # Arbitrary cap timeout 20m make -i -j $p buildworld DESTDIR=$mntpoint TARGET=amd64 \ TARGET_ARCH=amd64 > /dev/null diff --git a/tools/test/stress2/misc/buildworld2.sh b/tools/test/stress2/misc/buildworld2.sh index 9c1eed97b7ea..3653cb1db5b4 100755 --- a/tools/test/stress2/misc/buildworld2.sh +++ b/tools/test/stress2/misc/buildworld2.sh @@ -46,6 +46,7 @@ mkdir 
$TMPDIR chmod 0777 $TMPDIR p=$((`sysctl -n hw.ncpu`+ 1)) +[ $p -gt 32 ] && p=32 # Arbitrary cap make -j $p buildworld DESTDIR=$mntpoint TARGET=amd64 TARGET_ARCH=amd64 \ > /dev/null & sleep $((20 * 60)) diff --git a/tools/test/stress2/misc/buildworld3.sh b/tools/test/stress2/misc/buildworld3.sh index 0c660cae8eae..e3bce2764c0c 100755 --- a/tools/test/stress2/misc/buildworld3.sh +++ b/tools/test/stress2/misc/buildworld3.sh @@ -62,6 +62,7 @@ mkdir $TMPDIR $MAKEOBJDIRPREFIX chmod 0777 $TMPDIR $MAKEOBJDIRPREFIX p=$((`sysctl -n hw.ncpu`+ 1)) +[ $p -gt 32 ] && p=32 # Arbitrary cap su $testuser -c \ "make -i -j $p buildworld DESTDIR=$mntpoint TARGET=amd64 \ TARGET_ARCH=amd64 > /dev/null" & diff --git a/tools/test/stress2/misc/buildworld4.sh b/tools/test/stress2/misc/buildworld4.sh index 6c15a72a9dcb..d1d162120952 100755 --- a/tools/test/stress2/misc/buildworld4.sh +++ b/tools/test/stress2/misc/buildworld4.sh @@ -50,6 +50,7 @@ mkdir $TMPDIR chmod 0777 $TMPDIR p=$((`sysctl -n hw.ncpu`+ 1)) +[ $p -gt 16 ] && p=16 # Arbitrary cap [ `sysctl -n vm.swap_total` -gt 0 ] && p=$((p * 4)) p=`jot -r 1 1 $p` echo "make -i -j $p buildworld DESTDIR=$mntpoint TARGET=amd64 "\ diff --git a/tools/test/stress2/misc/crossmp3.sh b/tools/test/stress2/misc/crossmp3.sh index 5eecb936e900..32c625a1e4ad 100755 --- a/tools/test/stress2/misc/crossmp3.sh +++ b/tools/test/stress2/misc/crossmp3.sh @@ -41,6 +41,7 @@ CONT=/tmp/crossmp3.continue if [ $# -eq 0 ]; then N=`sysctl -n hw.ncpu` + [ $N -gt 32 ] && N=32 # Arbitrary cap usermem=`sysctl -n hw.usermem` [ `sysctl -n vm.swap_total` -eq 0 ] && usermem=$((usermem / 2)) size=$((usermem / 1024 / 1024 / N)) diff --git a/tools/test/stress2/misc/crossmp4.sh b/tools/test/stress2/misc/crossmp4.sh index e22f969b72bb..21d22bee69e5 100755 --- a/tools/test/stress2/misc/crossmp4.sh +++ b/tools/test/stress2/misc/crossmp4.sh @@ -40,6 +40,7 @@ . 
../default.cfg N=`sysctl -n hw.ncpu` +[ $N -gt 32 ] && N=32 # Arbitrary cap usermem=`sysctl -n hw.usermem` [ `swapinfo | wc -l` -eq 1 ] && usermem=$((usermem/100*80)) size=$((usermem / 1024 / 1024 - 2)) diff --git a/tools/test/stress2/misc/crossmp5.sh b/tools/test/stress2/misc/crossmp5.sh index 038dea7ebe4f..6e504d9f20ad 100755 --- a/tools/test/stress2/misc/crossmp5.sh +++ b/tools/test/stress2/misc/crossmp5.sh @@ -33,6 +33,7 @@ . ../default.cfg N=`sysctl -n hw.ncpu` +[ $N -gt 32 ] && N=32 # Arbitrary cap usermem=`sysctl -n hw.usermem` [ `swapinfo | wc -l` -eq 1 ] && usermem=$((usermem/100*80)) size=$((usermem / 1024 / 1024 / N)) diff --git a/tools/test/stress2/misc/crossmp8.sh b/tools/test/stress2/misc/crossmp8.sh index e877dfaf6d1c..eec5ba9bc7c1 100755 --- a/tools/test/stress2/misc/crossmp8.sh +++ b/tools/test/stress2/misc/crossmp8.sh @@ -41,6 +41,7 @@ CONT=/tmp/crossmp8.continue N=`sysctl -n hw.ncpu` +[ $N -gt 32 ] && N=32 # Arbitrary cap usermem=`sysctl -n hw.usermem` [ `swapinfo | wc -l` -eq 1 ] && usermem=$((usermem/100*80)) size=$((usermem / 1024 / 1024 / N)) diff --git a/tools/test/stress2/misc/gnop4.sh b/tools/test/stress2/misc/gnop4.sh index f938dd3b790b..1b4da74266f6 100755 --- a/tools/test/stress2/misc/gnop4.sh +++ b/tools/test/stress2/misc/gnop4.sh @@ -34,6 +34,8 @@ # https://people.freebsd.org/~pho/stress/log/kostik1017.txt # Fixed by r322175 +# Seen with p=513: Threads stuck in "ffsrca" + . ../default.cfg gigs=9 @@ -62,6 +64,7 @@ cd $mntpoint/src export MAKEOBJDIRPREFIX=$mntpoint/obj p=$((`sysctl -n hw.ncpu`+ 1)) +[ $p -gt 32 ] && p=32 # Temporary work around timeout 10m \ make -i -j $p buildworld DESTDIR=$mntpoint TARGET=amd64 \ TARGET_ARCH=amd64 > /dev/null diff --git a/tools/test/stress2/misc/tmpfs13.sh b/tools/test/stress2/misc/tmpfs13.sh index 29b44cbc9ad4..231c42033f9d 100755 --- a/tools/test/stress2/misc/tmpfs13.sh +++ b/tools/test/stress2/misc/tmpfs13.sh @@ -40,6 +40,7 @@ . 
../default.cfg N=`sysctl -n hw.ncpu` +[ $N -gt 32 ] && N=32 # Arbitrary cap usermem=`sysctl -n hw.usermem` [ `swapinfo | wc -l` -eq 1 ] && usermem=$((usermem/100*80)) size=$((usermem / 1024 / 1024 / 2)) diff --git a/tools/test/stress2/misc/zzbuildworld.sh b/tools/test/stress2/misc/zzbuildworld.sh index 2104eb156c86..e1bf867d8d5f 100755 --- a/tools/test/stress2/misc/zzbuildworld.sh +++ b/tools/test/stress2/misc/zzbuildworld.sh @@ -44,6 +44,7 @@ top=$mntpoint export MAKEOBJDIRPREFIX=$top/obj export log=$top/buildworld.`date +%Y%m%dT%H%M` n=$((`sysctl -n hw.ncpu` + 1)) +[ $n -gt 32 ] && n=32 # Arbitrary cap cd $src make -j$n buildworld > $log 2>&1 && s=0 ||s=1 grep '\*\*\*' $log && s=2 From b5332809c633e7e37715f7823a8a8ee9799910a4 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Mon, 22 Jul 2024 10:19:58 -0700 Subject: [PATCH 002/145] cxgbe/iw_cxgbe: Fix typo in assertion. eanbled -> enabled MFC after: 3 days --- sys/dev/cxgbe/iw_cxgbe/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/dev/cxgbe/iw_cxgbe/device.c b/sys/dev/cxgbe/iw_cxgbe/device.c index 279bdb20d511..209d12767a1c 100644 --- a/sys/dev/cxgbe/iw_cxgbe/device.c +++ b/sys/dev/cxgbe/iw_cxgbe/device.c @@ -284,7 +284,7 @@ c4iw_activate(struct adapter *sc) } if (uld_active(sc, ULD_IWARP)) { - KASSERT(0, ("%s: RDMA already eanbled on sc %p", __func__, sc)); + KASSERT(0, ("%s: RDMA already enabled on sc %p", __func__, sc)); return (0); } From 0a9d1da6e6cede5e9c0ff63240d724049ad72b5b Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Wed, 31 Jul 2024 12:27:18 -0700 Subject: [PATCH 003/145] cxgbe(4): Stop work request queues in a reliable manner. Clear the EQ_HW_ALLOCATED flag with the wrq lock held and discard all work requests, pending or new, when it's not set. 
MFC after: 1 week Sponsored by: Chelsio Communications --- sys/dev/cxgbe/adapter.h | 5 ++++- sys/dev/cxgbe/t4_main.c | 20 +++++++++++++++++++- sys/dev/cxgbe/t4_sge.c | 15 ++++++++++++++- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 0d731e736823..3922bd3909fe 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -1561,7 +1561,10 @@ t4_wrq_tx(struct adapter *sc, struct wrqe *wr) struct sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); - t4_wrq_tx_locked(sc, wrq, wr); + if (__predict_true(wrq->eq.flags & EQ_HW_ALLOCATED)) + t4_wrq_tx_locked(sc, wrq, wr); + else + free(wr, M_CXGBE); TXQ_UNLOCK(wrq); } diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 795c8d7e2e37..57c1eeceab22 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -2060,7 +2060,9 @@ stop_lld(struct adapter *sc) } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, k, ofld_txq) { + TXQ_LOCK(&ofld_txq->wrq); ofld_txq->wrq.eq.flags &= ~EQ_HW_ALLOCATED; + TXQ_UNLOCK(&ofld_txq->wrq); } #endif for_each_rxq(vi, k, rxq) { @@ -2078,7 +2080,9 @@ stop_lld(struct adapter *sc) if (sc->flags & FULL_INIT_DONE) { /* Control queue */ wrq = &sc->sge.ctrlq[i]; + TXQ_LOCK(wrq); wrq->eq.flags &= ~EQ_HW_ALLOCATED; + TXQ_UNLOCK(wrq); quiesce_wrq(wrq); } } @@ -7047,8 +7051,22 @@ quiesce_txq(struct sge_txq *txq) static void quiesce_wrq(struct sge_wrq *wrq) { + struct wrqe *wr; - /* XXXTX */ + TXQ_LOCK(wrq); + while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) { + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); +#ifdef INVARIANTS + wrq->nwr_pending--; + wrq->ndesc_needed -= howmany(wr->wr_len, EQ_ESIZE); +#endif + free(wr, M_CXGBE); + } + MPASS(wrq->nwr_pending == 0); + MPASS(wrq->ndesc_needed == 0); + wrq->nwr_pending = 0; + wrq->ndesc_needed = 0; + TXQ_UNLOCK(wrq); } static void diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index b4eb0701821a..bc81a0251deb 100644 --- 
a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -2921,6 +2921,10 @@ start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); EQ_LOCK(eq); + if (__predict_false((eq->flags & EQ_HW_ALLOCATED) == 0)) { + EQ_UNLOCK(eq); + return (NULL); + } if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); @@ -3016,7 +3020,10 @@ commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) F_FW_WR_EQUEQ); } - ring_eq_db(wrq->adapter, eq, ndesc); + if (__predict_true(eq->flags & EQ_HW_ALLOCATED)) + ring_eq_db(wrq->adapter, eq, ndesc); + else + IDXINCR(eq->dbidx, ndesc, eq->sidx); } else { MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); next->pidx = pidx; @@ -3852,6 +3859,8 @@ alloc_ctrlq(struct adapter *sc, int idx) if (!(ctrlq->eq.flags & EQ_HW_ALLOCATED)) { MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); + MPASS(ctrlq->nwr_pending == 0); + MPASS(ctrlq->ndesc_needed == 0); rc = alloc_eq_hwq(sc, NULL, &ctrlq->eq); if (rc != 0) { @@ -4554,6 +4563,7 @@ free_wrq(struct adapter *sc, struct sge_wrq *wrq) { free_eq(sc, &wrq->eq); MPASS(wrq->nwr_pending == 0); + MPASS(wrq->ndesc_needed == 0); MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); MPASS(STAILQ_EMPTY(&wrq->wr_list)); bzero(wrq, sizeof(*wrq)); @@ -4848,6 +4858,9 @@ alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx) } if (!(eq->flags & EQ_HW_ALLOCATED)) { + MPASS(eq->flags & EQ_SW_ALLOCATED); + MPASS(ofld_txq->wrq.nwr_pending == 0); + MPASS(ofld_txq->wrq.ndesc_needed == 0); rc = alloc_eq_hwq(sc, vi, eq); if (rc != 0) { CH_ERR(vi, "failed to create hw ofld_txq%d: %d\n", idx, From 8132e959099f0c533f698d8fbc17386f9144432f Mon Sep 17 00:00:00 2001 From: Eugene Grosbein Date: Mon, 19 Aug 2024 10:34:37 +0700 Subject: [PATCH 004/145] libalias: fix subtle racy problem in outside-inside forwarding sys/netinet/libalias/alias_db.c has internal static function UseLink() that passes a link to CleanupLink() 
to verify if the link has expired. If so, UseLink() may return NULL. _FindLinkIn()'s usage of UseLink() is not quite correct. Assume there is "redirect_port udp" configured to forward incoming traffic for specific port to some internal address. Such a rule creates partially specified permanent link. After first such packet libalias creates new fully specified temporary LINK_UDP with default timeout 60 seconds. Also, in case of low traffic libalias may assign "timestamp" for this new temporary link way in the past because LibAliasTime is updated seldom and can keep old value for tens of seconds, and it will be used for the temporary link. It may happen that next incoming packet for redirected port passed to _FindLinkIn() results in a call to UseLink() that returns NULL due to detected expiration. Immediate return of NULL results in broken translation: either a packet is dropped (deny_incoming mode) or delivered to original destination address instead of internal one. Fix it with additional check for NULL to proceed with a search for original partially specified link. In case of UDP, it also recreates temporary fully specified link with a call to ReLink(). Practical examples are "redirect_port udp" rules for unidirectional SYSLOG protocol (port 514) or some low volume VPN encapsulated in UDP. Thanks to Peter Much for initial analysis and first version of a patch.
Reported by: Peter Much PR: 269770 MFC after: 1 week --- sys/netinet/libalias/alias_db.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c index 167201fa1b8f..d516b6cda96c 100644 --- a/sys/netinet/libalias/alias_db.c +++ b/sys/netinet/libalias/alias_db.c @@ -868,8 +868,15 @@ _FindLinkIn(struct libalias *la, struct in_addr dst_addr, case 0: LIST_FOREACH(lnk, &grp->full, all.in) { if (lnk->dst_addr.s_addr == dst_addr.s_addr && - lnk->dst_port == dst_port) - return (UseLink(la, lnk)); + lnk->dst_port == dst_port) { + struct alias_link *found; + + found = UseLink(la, lnk); + if (found != NULL) + return (found); + /* link expired */ + break; + } } break; case LINK_UNKNOWN_DEST_PORT: From 308399a179a49b7b858c725de10177fdb0502fd2 Mon Sep 17 00:00:00 2001 From: Andre Albsmeier Date: Mon, 19 Aug 2024 10:54:24 +0300 Subject: [PATCH 005/145] tail -F: fix crash When show() detects an error and closes the file and follow() wants to close it again, a NULL dereference occurs. PR: 280910 MFC after: 1 week --- usr.bin/tail/forward.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/usr.bin/tail/forward.c b/usr.bin/tail/forward.c index a5303385a74f..6d9db94a827f 100644 --- a/usr.bin/tail/forward.c +++ b/usr.bin/tail/forward.c @@ -379,7 +379,8 @@ follow(file_info_t *files, enum STYLE style, off_t off) sb2.st_dev != file->st.st_dev || sb2.st_nlink == 0) { show(file); - fclose(file->fp); + if (file->fp != NULL) + fclose(file->fp); file->fp = ftmp; memcpy(&file->st, &sb2, sizeof(struct stat)); From 9ff2ebd92891b6953bbe44c6d6a6d8bc31f5139f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?= Date: Mon, 19 Aug 2024 10:30:01 +0200 Subject: [PATCH 006/145] adduser: Better document ZFS dataset creation. 
MFC after: 3 days PR: 280873 Reviewed by: bcr Differential Revision: https://reviews.freebsd.org/D46316 --- usr.sbin/adduser/adduser.conf.5 | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/usr.sbin/adduser/adduser.conf.5 b/usr.sbin/adduser/adduser.conf.5 index 09b80f2df021..9663926ee341 100644 --- a/usr.sbin/adduser/adduser.conf.5 +++ b/usr.sbin/adduser/adduser.conf.5 @@ -23,7 +23,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd April 12, 2007 +.Dd August 18, 2024 .Dt ADDUSER.CONF 5 .Os .Sh NAME @@ -171,8 +171,12 @@ The default information to be held in the GECOS field of .It Va uidstart The default user ID setting. This must be a number above 1000 and fewer than 65534. -.It Va Zflag -Do not attempt to create ZFS home dataset. +.It Va Zcreate +Set to +.Dq no +to prevent the creation of a ZFS home dataset if +.Va homeprefix +is a ZFS mountpoint. .El .Sh EXAMPLES The following is an example From 5048308bdb76f40e88c9133658fc61d82158ded2 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 29 Jul 2024 11:28:15 +0100 Subject: [PATCH 007/145] buf_ring: Remove PREFETCH_DEFINED I'm not able to find anything in the tree that ever defined it. Remove as it's unused so is untested. 
Reviewed by: alc, imp, kib, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46148 --- sys/sys/buf_ring.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index cb18175c3a75..6bd3b91dcd04 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -165,9 +165,6 @@ static __inline void * buf_ring_dequeue_sc(struct buf_ring *br) { uint32_t cons_head, cons_next; -#ifdef PREFETCH_DEFINED - uint32_t cons_next_next; -#endif uint32_t prod_tail; void *buf; @@ -204,20 +201,10 @@ buf_ring_dequeue_sc(struct buf_ring *br) prod_tail = atomic_load_acq_32(&br->br_prod_tail); cons_next = (cons_head + 1) & br->br_cons_mask; -#ifdef PREFETCH_DEFINED - cons_next_next = (cons_head + 2) & br->br_cons_mask; -#endif if (cons_head == prod_tail) return (NULL); -#ifdef PREFETCH_DEFINED - if (cons_next != prod_tail) { - prefetch(br->br_ring[cons_next]); - if (cons_next_next != prod_tail) - prefetch(br->br_ring[cons_next_next]); - } -#endif br->br_cons_head = cons_next; buf = br->br_ring[cons_head]; From d3d34d56bee4222b3bf3ec26d7877998405115a3 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 29 Jul 2024 11:28:24 +0100 Subject: [PATCH 008/145] buf_ring: Support DEBUG_BUFRING in userspace The only part of DEBUG_BUFRING we don't support in userspace is the mutex checks. Add _KERNEL checks around these so we can enable the extra debugging. 
Reviewed by: alc, imp, kib, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46149 --- sys/sys/buf_ring.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index 6bd3b91dcd04..2d61f393712d 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -35,13 +35,9 @@ #include #include -#ifdef DEBUG_BUFRING -#ifdef _KERNEL +#if defined(DEBUG_BUFRING) && defined(_KERNEL) #include #include -#else -#error "DEBUG_BUFRING is only supported in kernel" -#endif #endif struct buf_ring { @@ -54,7 +50,7 @@ struct buf_ring { volatile uint32_t br_cons_tail; int br_cons_size; int br_cons_mask; -#ifdef DEBUG_BUFRING +#if defined(DEBUG_BUFRING) && defined(_KERNEL) struct mtx *br_lock; #endif void *br_ring[0] __aligned(CACHE_LINE_SIZE); @@ -210,8 +206,10 @@ buf_ring_dequeue_sc(struct buf_ring *br) #ifdef DEBUG_BUFRING br->br_ring[cons_head] = NULL; +#ifdef _KERNEL if (!mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); +#endif if (br->br_cons_tail != cons_head) panic("inconsistent list cons_tail=%d cons_head=%d", br->br_cons_tail, cons_head); @@ -277,7 +275,7 @@ static __inline void * buf_ring_peek(struct buf_ring *br) { -#ifdef DEBUG_BUFRING +#if defined(DEBUG_BUFRING) && defined(_KERNEL) if ((br->br_lock != NULL) && !mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); #endif @@ -296,9 +294,9 @@ buf_ring_peek(struct buf_ring *br) static __inline void * buf_ring_peek_clear_sc(struct buf_ring *br) { -#ifdef DEBUG_BUFRING void *ret; +#if defined(DEBUG_BUFRING) && defined(_KERNEL) if (!mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); #endif @@ -320,17 +318,15 @@ buf_ring_peek_clear_sc(struct buf_ring *br) atomic_thread_fence_acq(); #endif + ret = br->br_ring[br->br_cons_head]; #ifdef DEBUG_BUFRING /* * Single consumer, i.e. 
cons_head will not move while we are * running, so atomic_swap_ptr() is not necessary here. */ - ret = br->br_ring[br->br_cons_head]; br->br_ring[br->br_cons_head] = NULL; - return (ret); -#else - return (br->br_ring[br->br_cons_head]); #endif + return (ret); } static __inline int From 17a597bc13aa59ee90facaf9b8dada80f32eb52d Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 29 Jul 2024 11:28:33 +0100 Subject: [PATCH 009/145] buf_ring: Consistently use atomic_*_32 We are operating on uint32_t values, use uint32_t atomic functions. Reviewed by: alc, imp, kib, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46150 --- sys/sys/buf_ring.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index 2d61f393712d..47ed04f570c1 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -94,7 +94,7 @@ buf_ring_enqueue(struct buf_ring *br, void *buf) } continue; } - } while (!atomic_cmpset_acq_int(&br->br_prod_head, prod_head, prod_next)); + } while (!atomic_cmpset_acq_32(&br->br_prod_head, prod_head, prod_next)); #ifdef DEBUG_BUFRING if (br->br_ring[prod_head] != NULL) panic("dangling value in enqueue"); @@ -108,7 +108,7 @@ buf_ring_enqueue(struct buf_ring *br, void *buf) */ while (br->br_prod_tail != prod_head) cpu_spinwait(); - atomic_store_rel_int(&br->br_prod_tail, prod_next); + atomic_store_rel_32(&br->br_prod_tail, prod_next); critical_exit(); return (0); } @@ -132,7 +132,7 @@ buf_ring_dequeue_mc(struct buf_ring *br) critical_exit(); return (NULL); } - } while (!atomic_cmpset_acq_int(&br->br_cons_head, cons_head, cons_next)); + } while (!atomic_cmpset_acq_32(&br->br_cons_head, cons_head, cons_next)); buf = br->br_ring[cons_head]; #ifdef DEBUG_BUFRING @@ -146,7 +146,7 @@ buf_ring_dequeue_mc(struct buf_ring *br) while (br->br_cons_tail != cons_head) cpu_spinwait(); - atomic_store_rel_int(&br->br_cons_tail, cons_next); + atomic_store_rel_32(&br->br_cons_tail, cons_next); 
critical_exit(); return (buf); From 3cc603909e09c958e20dd5a8a341f62f29e33a07 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 10:06:44 +0100 Subject: [PATCH 010/145] buf_ring: Keep the full head and tail values If a thread reads the head but then sleeps for long enough that another thread fills the ring and leaves the new head with the expected value then the cmpset can pass when it should have failed. To work around this keep the full head and tail value and use the upper bits as a generation count. Reviewed by: kib Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46151 --- sys/sys/buf_ring.h | 87 +++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 31 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index 47ed04f570c1..dec0f971ae44 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -40,6 +40,15 @@ #include #endif +/* + * We only apply the mask to the head and tail values when calculating the + * index into br_ring to access. This means the upper bits can be used as + * epoch to reduce the chance the atomic_cmpset succeedes when it should + * fail, e.g. when the head wraps while the CPU is in an interrupt. This + * is a probablistic fix as there is still a very unlikely chance the + * value wraps back to the expected value. + * + */ struct buf_ring { volatile uint32_t br_prod_head; volatile uint32_t br_prod_tail; @@ -63,28 +72,28 @@ struct buf_ring { static __inline int buf_ring_enqueue(struct buf_ring *br, void *buf) { - uint32_t prod_head, prod_next, cons_tail; -#ifdef DEBUG_BUFRING - int i; + uint32_t prod_head, prod_next, prod_idx; + uint32_t cons_tail, mask; + mask = br->br_prod_mask; +#ifdef DEBUG_BUFRING /* * Note: It is possible to encounter an mbuf that was removed * via drbr_peek(), and then re-added via drbr_putback() and * trigger a spurious panic. 
*/ - for (i = br->br_cons_head; i != br->br_prod_head; - i = ((i + 1) & br->br_cons_mask)) - if (br->br_ring[i] == buf) + for (uint32_t i = br->br_cons_head; i != br->br_prod_head; i++) + if (br->br_ring[i & mask] == buf) panic("buf=%p already enqueue at %d prod=%d cons=%d", buf, i, br->br_prod_tail, br->br_cons_tail); #endif critical_enter(); do { prod_head = br->br_prod_head; - prod_next = (prod_head + 1) & br->br_prod_mask; + prod_next = prod_head + 1; cons_tail = br->br_cons_tail; - if (prod_next == cons_tail) { + if ((int32_t)(cons_tail + br->br_prod_size - prod_next) < 1) { rmb(); if (prod_head == br->br_prod_head && cons_tail == br->br_cons_tail) { @@ -95,11 +104,12 @@ buf_ring_enqueue(struct buf_ring *br, void *buf) continue; } } while (!atomic_cmpset_acq_32(&br->br_prod_head, prod_head, prod_next)); + prod_idx = prod_head & mask; #ifdef DEBUG_BUFRING - if (br->br_ring[prod_head] != NULL) + if (br->br_ring[prod_idx] != NULL) panic("dangling value in enqueue"); #endif - br->br_ring[prod_head] = buf; + br->br_ring[prod_idx] = buf; /* * If there are other enqueues in progress @@ -120,23 +130,26 @@ buf_ring_enqueue(struct buf_ring *br, void *buf) static __inline void * buf_ring_dequeue_mc(struct buf_ring *br) { - uint32_t cons_head, cons_next; + uint32_t cons_head, cons_next, cons_idx; + uint32_t mask; void *buf; critical_enter(); + mask = br->br_cons_mask; do { cons_head = br->br_cons_head; - cons_next = (cons_head + 1) & br->br_cons_mask; + cons_next = cons_head + 1; if (cons_head == br->br_prod_tail) { critical_exit(); return (NULL); } } while (!atomic_cmpset_acq_32(&br->br_cons_head, cons_head, cons_next)); + cons_idx = cons_head & mask; - buf = br->br_ring[cons_head]; + buf = br->br_ring[cons_idx]; #ifdef DEBUG_BUFRING - br->br_ring[cons_head] = NULL; + br->br_ring[cons_idx] = NULL; #endif /* * If there are other dequeues in progress @@ -160,8 +173,8 @@ buf_ring_dequeue_mc(struct buf_ring *br) static __inline void * buf_ring_dequeue_sc(struct buf_ring *br) 
{ - uint32_t cons_head, cons_next; - uint32_t prod_tail; + uint32_t cons_head, cons_next, cons_idx; + uint32_t prod_tail, mask; void *buf; /* @@ -189,6 +202,7 @@ buf_ring_dequeue_sc(struct buf_ring *br) * * <1> Load (on core 1) from br->br_ring[cons_head] can be reordered (speculative readed) by CPU. */ + mask = br->br_cons_mask; #if defined(__arm__) || defined(__aarch64__) cons_head = atomic_load_acq_32(&br->br_cons_head); #else @@ -196,16 +210,17 @@ buf_ring_dequeue_sc(struct buf_ring *br) #endif prod_tail = atomic_load_acq_32(&br->br_prod_tail); - cons_next = (cons_head + 1) & br->br_cons_mask; + cons_next = cons_head + 1; - if (cons_head == prod_tail) + if (cons_head == prod_tail) return (NULL); + cons_idx = cons_head & mask; br->br_cons_head = cons_next; - buf = br->br_ring[cons_head]; + buf = br->br_ring[cons_idx]; #ifdef DEBUG_BUFRING - br->br_ring[cons_head] = NULL; + br->br_ring[cons_idx] = NULL; #ifdef _KERNEL if (!mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); @@ -226,18 +241,21 @@ buf_ring_dequeue_sc(struct buf_ring *br) static __inline void buf_ring_advance_sc(struct buf_ring *br) { - uint32_t cons_head, cons_next; - uint32_t prod_tail; + uint32_t cons_head, cons_next, prod_tail; +#ifdef DEBUG_BUFRING + uint32_t mask; + mask = br->br_cons_mask; +#endif cons_head = br->br_cons_head; prod_tail = br->br_prod_tail; - cons_next = (cons_head + 1) & br->br_cons_mask; - if (cons_head == prod_tail) + cons_next = cons_head + 1; + if (cons_head == prod_tail) return; br->br_cons_head = cons_next; #ifdef DEBUG_BUFRING - br->br_ring[cons_head] = NULL; + br->br_ring[cons_head & mask] = NULL; #endif br->br_cons_tail = cons_next; } @@ -261,9 +279,12 @@ buf_ring_advance_sc(struct buf_ring *br) static __inline void buf_ring_putback_sc(struct buf_ring *br, void *new) { - KASSERT(br->br_cons_head != br->br_prod_tail, + uint32_t mask; + + mask = br->br_cons_mask; + KASSERT((br->br_cons_head & mask) != (br->br_prod_tail & mask), ("Buf-Ring has 
none in putback")) ; - br->br_ring[br->br_cons_head] = new; + br->br_ring[br->br_cons_head & mask] = new; } /* @@ -274,11 +295,13 @@ buf_ring_putback_sc(struct buf_ring *br, void *new) static __inline void * buf_ring_peek(struct buf_ring *br) { + uint32_t mask; #if defined(DEBUG_BUFRING) && defined(_KERNEL) if ((br->br_lock != NULL) && !mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); #endif + mask = br->br_cons_mask; /* * I believe it is safe to not have a memory barrier * here because we control cons and tail is worst case @@ -288,12 +311,13 @@ buf_ring_peek(struct buf_ring *br) if (br->br_cons_head == br->br_prod_tail) return (NULL); - return (br->br_ring[br->br_cons_head]); + return (br->br_ring[br->br_cons_head & mask]); } static __inline void * buf_ring_peek_clear_sc(struct buf_ring *br) { + uint32_t mask; void *ret; #if defined(DEBUG_BUFRING) && defined(_KERNEL) @@ -301,6 +325,7 @@ buf_ring_peek_clear_sc(struct buf_ring *br) panic("lock not held on single consumer dequeue"); #endif + mask = br->br_cons_mask; if (br->br_cons_head == br->br_prod_tail) return (NULL); @@ -318,13 +343,13 @@ buf_ring_peek_clear_sc(struct buf_ring *br) atomic_thread_fence_acq(); #endif - ret = br->br_ring[br->br_cons_head]; + ret = br->br_ring[br->br_cons_head & mask]; #ifdef DEBUG_BUFRING /* * Single consumer, i.e. cons_head will not move while we are * running, so atomic_swap_ptr() is not necessary here. 
*/ - br->br_ring[br->br_cons_head] = NULL; + br->br_ring[br->br_cons_head & mask] = NULL; #endif return (ret); } @@ -333,7 +358,7 @@ static __inline int buf_ring_full(struct buf_ring *br) { - return (((br->br_prod_head + 1) & br->br_prod_mask) == br->br_cons_tail); + return (br->br_prod_head == br->br_cons_tail + br->br_cons_size - 1); } static __inline int From 44e1cfca417c5ef0db908f3836ec3ba704ef1de2 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 10:06:52 +0100 Subject: [PATCH 011/145] buf_ring: Use atomic operations with br_cons_tail Use an atomic operation with a memory barrier loading br_cons_tail from the producer thread and storing to it in the consumer thread. On dequeue we need to read the pointer value from the buf_ring before moving the consumer tail as that indicates the entry is available to be used. The store release atomic operation guarantees this. In the enqueueing thread we then need to use a load acquire atomic operation to ensure writing to this entry can only happen after the tail has been read and checked. 
Reported by: Ali Saidi Co-developed by: Ali Saidi Reviewed by: markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46152 --- sys/sys/buf_ring.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index dec0f971ae44..9aeb5aa19fa8 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -91,7 +91,7 @@ buf_ring_enqueue(struct buf_ring *br, void *buf) do { prod_head = br->br_prod_head; prod_next = prod_head + 1; - cons_tail = br->br_cons_tail; + cons_tail = atomic_load_acq_32(&br->br_cons_tail); if ((int32_t)(cons_tail + br->br_prod_size - prod_next) < 1) { rmb(); @@ -229,7 +229,7 @@ buf_ring_dequeue_sc(struct buf_ring *br) panic("inconsistent list cons_tail=%d cons_head=%d", br->br_cons_tail, cons_head); #endif - br->br_cons_tail = cons_next; + atomic_store_rel_32(&br->br_cons_tail, cons_next); return (buf); } @@ -257,7 +257,7 @@ buf_ring_advance_sc(struct buf_ring *br) #ifdef DEBUG_BUFRING br->br_ring[cons_head & mask] = NULL; #endif - br->br_cons_tail = cons_next; + atomic_store_rel_32(&br->br_cons_tail, cons_next); } /* From 7eb0fffc7792bc294d6b86546a3923e2e2f83f8a Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 10:07:10 +0100 Subject: [PATCH 012/145] buf_ring: Remove old arm-only dequeue code In the single consumer dequeue the consumer thread controls br_cons_head. As such no ordering between this and other data are required. Reviewed by: alc, imp, kib, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46153 --- sys/sys/buf_ring.h | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index 9aeb5aa19fa8..ad463fba27cb 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -177,37 +177,8 @@ buf_ring_dequeue_sc(struct buf_ring *br) uint32_t prod_tail, mask; void *buf; - /* - * This is a workaround to allow using buf_ring on ARM and ARM64. 
- * ARM64TODO: Fix buf_ring in a generic way. - * REMARKS: It is suspected that br_cons_head does not require - * load_acq operation, but this change was extensively tested - * and confirmed it's working. To be reviewed once again in - * FreeBSD-12. - * - * Preventing following situation: - - * Core(0) - buf_ring_enqueue() Core(1) - buf_ring_dequeue_sc() - * ----------------------------------------- ---------------------------------------------- - * - * cons_head = br->br_cons_head; - * atomic_cmpset_acq_32(&br->br_prod_head, ...)); - * buf = br->br_ring[cons_head]; > - * br->br_ring[prod_head] = buf; - * atomic_store_rel_32(&br->br_prod_tail, ...); - * prod_tail = br->br_prod_tail; - * if (cons_head == prod_tail) - * return (NULL); - * ` - * - * <1> Load (on core 1) from br->br_ring[cons_head] can be reordered (speculative readed) by CPU. - */ mask = br->br_cons_mask; -#if defined(__arm__) || defined(__aarch64__) - cons_head = atomic_load_acq_32(&br->br_cons_head); -#else cons_head = br->br_cons_head; -#endif prod_tail = atomic_load_acq_32(&br->br_prod_tail); cons_next = cons_head + 1; From 947754af55edc217c10e3456d97558c4eb6d0f99 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 10:07:19 +0100 Subject: [PATCH 013/145] buf_ring: Use atomic operations with br_prod_tail As with br_cons_tail use an atomic load acquire to read br_prod_tail in buf_ring_dequeue_mc and buf_ring_peek*. On dequeue we need to ensure we don't read the entry from the buf_ring until it is available and prod_tail has updated. There is already an appropriate store in the enqueue path and an appropriate load in the single consumer dequeue, we just need one in the other functions that read from the buf_ring. 
Reviewed by: imp, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46154 --- sys/sys/buf_ring.h | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index ad463fba27cb..66e1e55bc5e9 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -131,7 +131,7 @@ static __inline void * buf_ring_dequeue_mc(struct buf_ring *br) { uint32_t cons_head, cons_next, cons_idx; - uint32_t mask; + uint32_t prod_tail, mask; void *buf; critical_enter(); @@ -139,8 +139,9 @@ buf_ring_dequeue_mc(struct buf_ring *br) do { cons_head = br->br_cons_head; cons_next = cons_head + 1; + prod_tail = atomic_load_acq_32(&br->br_prod_tail); - if (cons_head == br->br_prod_tail) { + if (cons_head == prod_tail) { critical_exit(); return (NULL); } @@ -266,29 +267,26 @@ buf_ring_putback_sc(struct buf_ring *br, void *new) static __inline void * buf_ring_peek(struct buf_ring *br) { - uint32_t mask; + uint32_t cons_head, prod_tail, mask; #if defined(DEBUG_BUFRING) && defined(_KERNEL) if ((br->br_lock != NULL) && !mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); #endif mask = br->br_cons_mask; - /* - * I believe it is safe to not have a memory barrier - * here because we control cons and tail is worst case - * a lagging indicator so we worst case we might - * return NULL immediately after a buffer has been enqueued - */ - if (br->br_cons_head == br->br_prod_tail) + prod_tail = atomic_load_acq_32(&br->br_prod_tail); + cons_head = br->br_cons_head; + + if (cons_head == prod_tail) return (NULL); - return (br->br_ring[br->br_cons_head & mask]); + return (br->br_ring[cons_head & mask]); } static __inline void * buf_ring_peek_clear_sc(struct buf_ring *br) { - uint32_t mask; + uint32_t cons_head, prod_tail, mask; void *ret; #if defined(DEBUG_BUFRING) && defined(_KERNEL) @@ -297,30 +295,19 @@ buf_ring_peek_clear_sc(struct buf_ring *br) #endif mask = 
br->br_cons_mask; - if (br->br_cons_head == br->br_prod_tail) - return (NULL); + prod_tail = atomic_load_acq_32(&br->br_prod_tail); + cons_head = br->br_cons_head; -#if defined(__arm__) || defined(__aarch64__) - /* - * The barrier is required there on ARM and ARM64 to ensure, that - * br->br_ring[br->br_cons_head] will not be fetched before the above - * condition is checked. - * Without the barrier, it is possible, that buffer will be fetched - * before the enqueue will put mbuf into br, then, in the meantime, the - * enqueue will update the array and the br_prod_tail, and the - * conditional check will be true, so we will return previously fetched - * (and invalid) buffer. - */ - atomic_thread_fence_acq(); -#endif + if (cons_head == prod_tail) + return (NULL); - ret = br->br_ring[br->br_cons_head & mask]; + ret = br->br_ring[cons_head & mask]; #ifdef DEBUG_BUFRING /* * Single consumer, i.e. cons_head will not move while we are * running, so atomic_swap_ptr() is not necessary here. */ - br->br_ring[br->br_cons_head & mask] = NULL; + br->br_ring[cons_head & mask] = NULL; #endif return (ret); } From fe2445f47d027c73aa7266669e7d94b70d3949a4 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 10:07:26 +0100 Subject: [PATCH 014/145] buf_ring: Ensure correct ordering of loads When enqueueing on an architecture with a weak memory model ensure loading br->br_prod_head and br->br_cons_tail are ordered correctly. If br_cons_tail is loaded first then other threads may perform a dequeue and enqueue before br_prod_head is loaded. This will mean the tail is one less than it should be and the code under the prod_next == cons_tail check could incorrectly be skipped. buf_ring_dequeue_mc has the same issue with br->br_prod_tail and br->br_cons_head so needs the same fix. 
Reported by: Ali Saidi Co-developed by: Ali Saidi Reviewed by: imp, kib, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46155 --- sys/sys/buf_ring.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index 66e1e55bc5e9..512f20dc13e2 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -89,7 +89,17 @@ buf_ring_enqueue(struct buf_ring *br, void *buf) #endif critical_enter(); do { - prod_head = br->br_prod_head; + /* + * br->br_prod_head needs to be read before br->br_cons_tail. + * If not then we could perform the dequeue and enqueue + * between reading br_cons_tail and reading br_prod_head. This + * could give us values where br_cons_head == br_prod_tail + * (after masking). + * + * To work around this us a load acquire. This is just to + * ensure ordering within this thread. + */ + prod_head = atomic_load_acq_32(&br->br_prod_head); prod_next = prod_head + 1; cons_tail = atomic_load_acq_32(&br->br_cons_tail); @@ -137,7 +147,12 @@ buf_ring_dequeue_mc(struct buf_ring *br) critical_enter(); mask = br->br_cons_mask; do { - cons_head = br->br_cons_head; + /* + * As with buf_ring_enqueue ensure we read the head before + * the tail. If we read them in the wrong order we may + * think the bug_ring is full when it is empty. + */ + cons_head = atomic_load_acq_32(&br->br_cons_head); cons_next = cons_head + 1; prod_tail = atomic_load_acq_32(&br->br_prod_tail); From 87940d2b331c2575f96fbff2600d8da35191157b Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 10:08:20 +0100 Subject: [PATCH 015/145] buf_ring: Add an Arm copyright I've changed enough of this file to add Arm as a copyright holder. Add it after the "All rights reserved" line as that's not needed.
Reviewed by: imp Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46157 --- sys/sys/buf_ring.h | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h index 512f20dc13e2..c99cf81d8b6d 100644 --- a/sys/sys/buf_ring.h +++ b/sys/sys/buf_ring.h @@ -3,6 +3,7 @@ * * Copyright (c) 2007-2009 Kip Macy * All rights reserved. + * Copyright (c) 2024 Arm Ltd * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions From 7a345763f96d86eee9ab578e64311bf452e58900 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Tue, 30 Jul 2024 10:26:21 +0100 Subject: [PATCH 016/145] arm64: Expand the use of Armv8.1-A atomics When targeting Armv8.1 we can assume FEAT_LSE is available and can use the atomic instructions this provides without needing to check for support first. Reviewed by: imp, markj, emaste Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46159 --- sys/arm64/include/atomic.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/arm64/include/atomic.h b/sys/arm64/include/atomic.h index 76ca951678d4..c7aaa76d9de9 100644 --- a/sys/arm64/include/atomic.h +++ b/sys/arm64/include/atomic.h @@ -65,8 +65,9 @@ extern _Bool lse_supported; #include -#ifdef _KERNEL - +#if defined(__ARM_FEATURE_ATOMICS) +#define _ATOMIC_LSE_SUPPORTED 1 +#elif defined(_KERNEL) #ifdef LSE_ATOMICS #define _ATOMIC_LSE_SUPPORTED 1 #else From 43caa2e805c28a236e6624aedd91591d7018fce5 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 13:55:47 +0000 Subject: [PATCH 017/145] bhyve: Make boot ROM handling more consistent - On amd64, deprecate lpc.bootrom and lpc.bootvars. Use top-level config variables instead. - Introduce a generic predicate which can be used to determine whether the guest has a boot ROM. 
Reviewed by: corvink, jhb MFC after: 2 weeks Sponsored by: Innovate UK Differential Revision: https://reviews.freebsd.org/D46282 --- usr.sbin/bhyve/amd64/bhyverun_machdep.c | 20 ++++++++++++++++++-- usr.sbin/bhyve/amd64/ioapic.c | 3 ++- usr.sbin/bhyve/amd64/pci_irq.c | 3 ++- usr.sbin/bhyve/amd64/pci_lpc.c | 19 ++----------------- usr.sbin/bhyve/amd64/pci_lpc.h | 1 - usr.sbin/bhyve/bhyve_config.5 | 24 ++++++++++++------------ usr.sbin/bhyve/bhyverun.c | 7 +------ usr.sbin/bhyve/bootrom.c | 17 +++++++++++++---- usr.sbin/bhyve/bootrom.h | 3 ++- 9 files changed, 52 insertions(+), 45 deletions(-) diff --git a/usr.sbin/bhyve/amd64/bhyverun_machdep.c b/usr.sbin/bhyve/amd64/bhyverun_machdep.c index c453092107d5..d51ad3a5fc05 100644 --- a/usr.sbin/bhyve/amd64/bhyverun_machdep.c +++ b/usr.sbin/bhyve/amd64/bhyverun_machdep.c @@ -37,6 +37,7 @@ #include "acpi.h" #include "atkbdc.h" #include "bhyverun.h" +#include "bootrom.h" #include "config.h" #include "debug.h" #include "e820.h" @@ -241,6 +242,18 @@ bhyve_optparse(int argc, char **argv) bhyve_usage(1); } } + + /* Handle backwards compatibility aliases in config options. 
*/ + if (get_config_value("lpc.bootrom") != NULL && + get_config_value("bootrom") == NULL) { + warnx("lpc.bootrom is deprecated, use '-o bootrom' instead"); + set_config_value("bootrom", get_config_value("lpc.bootrom")); + } + if (get_config_value("lpc.bootvars") != NULL && + get_config_value("bootvars") == NULL) { + warnx("lpc.bootvars is deprecated, use '-o bootvars' instead"); + set_config_value("bootvars", get_config_value("lpc.bootvars")); + } } void @@ -291,7 +304,7 @@ bhyve_start_vcpu(struct vcpu *vcpu, bool bsp) int error; if (bsp) { - if (lpc_bootrom()) { + if (bootrom_boot()) { error = vm_set_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); if (error != 0) { @@ -332,6 +345,9 @@ bhyve_init_platform(struct vmctx *ctx, struct vcpu *bsp __unused) rtc_init(ctx); sci_init(ctx); error = e820_init(ctx); + if (error != 0) + return (error); + error = bootrom_loadrom(ctx); if (error != 0) return (error); @@ -355,7 +371,7 @@ bhyve_init_platform_late(struct vmctx *ctx, struct vcpu *bsp __unused) if (error != 0) return (error); - if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) + if (bootrom_boot() && strcmp(lpc_fwcfg(), "bhyve") == 0) fwctl_init(); if (get_config_bool("acpi_tables")) { diff --git a/usr.sbin/bhyve/amd64/ioapic.c b/usr.sbin/bhyve/amd64/ioapic.c index 9ad1c501fbae..494fb0c7ae82 100644 --- a/usr.sbin/bhyve/amd64/ioapic.c +++ b/usr.sbin/bhyve/amd64/ioapic.c @@ -33,6 +33,7 @@ #include #include +#include "bootrom.h" #include "ioapic.h" #include "pci_emul.h" #include "pci_lpc.h" @@ -72,7 +73,7 @@ ioapic_pci_alloc_irq(struct pci_devinst *pi) if (pci_pins == 0) return (-1); - if (lpc_bootrom()) { + if (bootrom_boot()) { /* For external bootrom use fixed mapping. 
*/ return (16 + (4 + pi->pi_slot + pi->pi_lintr.pin) % 8); } diff --git a/usr.sbin/bhyve/amd64/pci_irq.c b/usr.sbin/bhyve/amd64/pci_irq.c index 7e1aee7fbb1d..fea6d9a2591c 100644 --- a/usr.sbin/bhyve/amd64/pci_irq.c +++ b/usr.sbin/bhyve/amd64/pci_irq.c @@ -38,6 +38,7 @@ #include #include "acpi.h" +#include "bootrom.h" #include "inout.h" #include "ioapic.h" #include "pci_emul.h" @@ -205,7 +206,7 @@ pirq_alloc_pin(struct pci_devinst *pi) pirq_cold = 0; - if (lpc_bootrom()) { + if (bootrom_boot()) { /* For external bootrom use fixed mapping. */ best_pin = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8; } else { diff --git a/usr.sbin/bhyve/amd64/pci_lpc.c b/usr.sbin/bhyve/amd64/pci_lpc.c index 57d2333edcc6..ed41a800a2ea 100644 --- a/usr.sbin/bhyve/amd64/pci_lpc.c +++ b/usr.sbin/bhyve/amd64/pci_lpc.c @@ -104,7 +104,7 @@ lpc_device_parse(const char *opts) if (romfile == NULL) { errx(4, "invalid bootrom option \"%s\"", opts); } - set_config_value("lpc.bootrom", romfile); + set_config_value("bootrom", romfile); varfile = strsep(&str, ","); if (varfile == NULL) { @@ -112,7 +112,7 @@ lpc_device_parse(const char *opts) goto done; } if (strchr(varfile, '=') == NULL) { - set_config_value("lpc.bootvars", varfile); + set_config_value("bootvars", varfile); } else { /* varfile doesn't exist, it's another config * option */ @@ -182,13 +182,6 @@ lpc_print_supported_devices(void) printf("%s\n", pctestdev_getname()); } -const char * -lpc_bootrom(void) -{ - - return (get_config_value("lpc.bootrom")); -} - const char * lpc_fwcfg(void) { @@ -256,14 +249,6 @@ lpc_init(struct vmctx *ctx) const char *backend, *name; char *node_name; int unit, error; - const nvlist_t *nvl; - - nvl = find_config_node("lpc"); - if (nvl != NULL && nvlist_exists(nvl, "bootrom")) { - error = bootrom_loadrom(ctx, nvl); - if (error) - return (error); - } /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { diff --git a/usr.sbin/bhyve/amd64/pci_lpc.h b/usr.sbin/bhyve/amd64/pci_lpc.h index 
2dca8f7bec24..402eae082545 100644 --- a/usr.sbin/bhyve/amd64/pci_lpc.h +++ b/usr.sbin/bhyve/amd64/pci_lpc.h @@ -69,7 +69,6 @@ int lpc_device_parse(const char *opt); void lpc_print_supported_devices(void); char *lpc_pirq_name(int pin); void lpc_pirq_routed(void); -const char *lpc_bootrom(void); const char *lpc_fwcfg(void); #endif diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5 index d0e5c8ae47d3..ebbb206cca9f 100644 --- a/usr.sbin/bhyve/bhyve_config.5 +++ b/usr.sbin/bhyve/bhyve_config.5 @@ -23,7 +23,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd November 20, 2023 +.Dd August 13, 2024 .Dt BHYVE_CONFIG 5 .Os .Sh NAME @@ -120,6 +120,17 @@ The value must be formatted as described in .Xr expand_number 3 . .It Va memory.wired Ta bool Ta false Ta Wire guest memory. +.It Va bootrom Ta path Ta Ta +Path to a boot ROM. +During initialization of the guest, the contents of this file are copied into +the guest's memory. +If a boot ROM is present, a firmware interface device is +also enabled for use by the boot ROM. +.It Va bootvars Ta path Ta Ta +Path to boot VARS. +The contents of this file are copied beneath the boot ROM. +Firmware can write to it to save variables. +All variables will be persistent even on reboots of the guest. .It Va acpi_tables Ta bool Ta true Ta Generate ACPI tables. .It Va acpi_tables_in_memory Ta bool Ta true Ta @@ -550,17 +561,6 @@ The following nodes are available under .Va lpc : .Bl -column "pc-testdev" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description -.It Va bootrom Ta path Ta Ta -Path to a boot ROM. -The contents of this file are copied into the guest's -memory ending just before the 4GB physical address. -If a boot ROM is present, a firmware interface device is -also enabled for use by the boot ROM. -.It Va bootvars Ta path Ta Ta -Path to boot VARS. -The contents of this file are copied beneath the boot ROM. 
-Firmware can write to it to save variables. -All variables will be persistent even on reboots of the guest. .It Va com1 Ta node Ta Ta Settings for the COM1 serial port device. .It Va com2 Ta node Ta Ta diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index f844da90e76c..41655a188bf9 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -525,12 +525,7 @@ do_open(const char *vmname) reinit = false; -#ifdef __amd64__ - romboot = lpc_bootrom() != NULL; -#else - romboot = true; -#endif - + romboot = bootrom_boot(); error = vm_create(vmname); if (error) { if (errno == EEXIST) { diff --git a/usr.sbin/bhyve/bootrom.c b/usr.sbin/bhyve/bootrom.c index 1d461ba76597..e4adaca55947 100644 --- a/usr.sbin/bhyve/bootrom.c +++ b/usr.sbin/bhyve/bootrom.c @@ -192,7 +192,7 @@ bootrom_alloc(struct vmctx *ctx, size_t len, int prot, int flags, } int -bootrom_loadrom(struct vmctx *ctx, const nvlist_t *nvl) +bootrom_loadrom(struct vmctx *ctx) { struct stat sbuf; ssize_t rlen; @@ -204,9 +204,9 @@ bootrom_loadrom(struct vmctx *ctx, const nvlist_t *nvl) rv = -1; varfd = -1; - bootrom = get_config_value_node(nvl, "bootrom"); + bootrom = get_config_value("bootrom"); if (bootrom == NULL) { - return (-1); + return (0); } /* @@ -235,7 +235,7 @@ bootrom_loadrom(struct vmctx *ctx, const nvlist_t *nvl) rom_size = sbuf.st_size; - varfile = get_config_value_node(nvl, "bootvars"); + varfile = get_config_value("bootvars"); var_size = 0; if (varfile != NULL) { varfd = open(varfile, O_RDWR); @@ -314,3 +314,12 @@ bootrom_loadrom(struct vmctx *ctx, const nvlist_t *nvl) free(romfile); return (rv); } + +/* + * Are we relying on a bootrom to initialize the guest's CPU context? 
+ */ +bool +bootrom_boot(void) +{ + return (get_config_value("bootrom") != NULL); +} diff --git a/usr.sbin/bhyve/bootrom.h b/usr.sbin/bhyve/bootrom.h index d22ac3718fa2..0477b0f35218 100644 --- a/usr.sbin/bhyve/bootrom.h +++ b/usr.sbin/bhyve/bootrom.h @@ -45,6 +45,7 @@ enum { }; int bootrom_alloc(struct vmctx *ctx, size_t len, int prot, int flags, char **region_out, uint64_t *gpa_out); -int bootrom_loadrom(struct vmctx *ctx, const nvlist_t *nvl); +bool bootrom_boot(void); +int bootrom_loadrom(struct vmctx *ctx); #endif From e962b37bf0ffe7f30f5b025b46ea49ba01c71f2f Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 13:56:06 +0000 Subject: [PATCH 018/145] bhyve: Do not enable PCI BAR decoding if a boot ROM is present Let the boot ROM handle BAR initialization. This fixes a problem where u-boot's BAR remapping conflicts with some limitations in bhyve. See https://lists.freebsd.org/archives/freebsd-virtualization/2024-April/002103.html for a description of what goes wrong. The old behaviour can be restored by setting the pci.enable_bars configuration variable. Reviewed by: corvink, jhb Sponsored by: Innovate UK Differential Revision: https://reviews.freebsd.org/D45049 --- usr.sbin/bhyve/bhyve_config.5 | 3 +++ usr.sbin/bhyve/pci_emul.c | 27 ++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5 index ebbb206cca9f..25185e2ef1b4 100644 --- a/usr.sbin/bhyve/bhyve_config.5 +++ b/usr.sbin/bhyve/bhyve_config.5 @@ -157,6 +157,9 @@ Specify the keyboard layout name with the file name in This value only works when loaded with UEFI mode for VNC, and used a VNC client that don't support QEMU Extended Key Event Message (e.g. TightVNC). +.It Va pci.enable_bars Ta bool Ta Ta +Enable and map PCI BARs before executing any guest code. +This setting is false by default when using a boot ROM and true otherwise. .It Va tpm.path Ta string Ta Ta Path to the host TPM device. 
This is typically /dev/tpm0. diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 00e9138d3910..e066d6766f3c 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -48,6 +48,7 @@ #include "acpi.h" #include "bhyverun.h" +#include "bootrom.h" #include "config.h" #include "debug.h" #ifdef __amd64__ @@ -853,6 +854,14 @@ pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, TAILQ_INSERT_BEFORE(bar, new_bar, chain); } + /* + * Enable PCI BARs only if we don't have a boot ROM, i.e., bhyveload was + * used to load the initial guest image. Otherwise, we rely on the boot + * ROM to handle this. + */ + if (!get_config_bool_default("pci.enable_bars", !bootrom_boot())) + return (0); + /* * pci_passthru devices synchronize their physical and virtual command * register on init. For that reason, the virtual cmd reg should be @@ -966,8 +975,19 @@ pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx, pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } - if (type != PCIBAR_ROM) { - register_bar(pdi, idx); + switch (type) { + case PCIBAR_IO: + if (porten(pdi)) + register_bar(pdi, idx); + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + case PCIBAR_MEMHI64: + if (memen(pdi)) + register_bar(pdi, idx); + break; + default: + break; } return (0); @@ -1140,7 +1160,8 @@ pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); - pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); + if (!get_config_bool_default("pci.enable_bars", !bootrom_boot())) + pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(pdi, fi->fi_config); if (err == 0) From f66e71fa78e16164339f7fd4791306fb30165581 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Mon, 19 Aug 2024 09:43:37 -0400 Subject: [PATCH 019/145] linux.4: clarify path translation Try to be a little more explicit about the path translation mechanism 
accessing /compat/linux/ then falling back to /. PR: 277804 Reviewed by: fernape Sponsored by: The FreeBSD Foundation --- share/man/man4/linux.4 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/share/man/man4/linux.4 b/share/man/man4/linux.4 index 212dd2526f3f..b404c9e1c04d 100644 --- a/share/man/man4/linux.4 +++ b/share/man/man4/linux.4 @@ -63,9 +63,11 @@ before .Pa / . For example, when Linux process attempts to open .Pa /etc/passwd , -it will really access +it will first access .Pa /compat/linux/etc/passwd , -unless the latter does not exist. +falling back to +.Pa /etc/passwd +if the former does not exist. This is used to make sure Linux processes load Linux shared libraries instead of their similarly-named FreeBSD counterparts, and also to provide alternative versions of certain other files and virtual From d1daec3d358eb5aaa38fa7c95fbfa330c46a69a1 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Mon, 19 Aug 2024 10:26:26 -0400 Subject: [PATCH 020/145] linux.4: improve the path translation clarification As suggested by martin@lispworks.com, refer to the compat path explicitly, and correct an existing grammaro. PR: 277804 Fixes: f66e71fa78e1 ("linux.4: clarify path translation") Sponsored by: The FreeBSD Foundation --- share/man/man4/linux.4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/share/man/man4/linux.4 b/share/man/man4/linux.4 index b404c9e1c04d..711ac11e8fce 100644 --- a/share/man/man4/linux.4 +++ b/share/man/man4/linux.4 @@ -61,13 +61,13 @@ under .Pa /compat/linux ) before .Pa / . -For example, when Linux process attempts to open +For example, when a Linux process attempts to open .Pa /etc/passwd , it will first access .Pa /compat/linux/etc/passwd , falling back to .Pa /etc/passwd -if the former does not exist. +if the compat path does not exist.
This is used to make sure Linux processes load Linux shared libraries instead of their similarly-named FreeBSD counterparts, and also to provide alternative versions of certain other files and virtual From 1d26746cfd4a19e9d641ecd2a21e764743a7bc03 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:07:37 +0000 Subject: [PATCH 021/145] build.7: Document the packages target Reviewed by: manu, emaste MFC after: 1 week Sponsored by: Innovate UK Differential Revision: https://reviews.freebsd.org/D46286 --- share/man/man7/build.7 | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/share/man/man7/build.7 b/share/man/man7/build.7 index 3c71c14e6039..afafef9d3c02 100644 --- a/share/man/man7/build.7 +++ b/share/man/man7/build.7 @@ -401,6 +401,19 @@ Install the kernel to the directory .Pa ${DISTDIR}/kernel/boot/kernel . This target is used while building a release; see .Xr release 7 . +.It Cm packages +Create a +.Xr pkg 7 +repository containing packages that can be used to create or upgrade an +installation of the base system. +The output repository is placed in the object directory, under +.Pa repo/${PKG_ABI} +where +.Va PKG_ABI +is the +.Xr pkg 7 +ABI for the build target, for example, +.Pa /usr/obj/${SRCDIR}/repo/FreeBSD:15:amd64 . .It Cm packagekernel Archive the results of .Cm distributekernel , From b118b6eb4cb7520eb348a6ac965b077fc5179fde Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:07:51 +0000 Subject: [PATCH 022/145] pkgbase: Unify pkg ABI handling for pkgbase targets Right now, to get the pkg ABI we either use PKG_ABI, derived from newvers.sh, or use an ABI file from the staged world. This inconsistency is confusing and can cause problems. Switch to a single source of truth: use an ABI file from the worldstage dir to get the ABI of pkgbase packages. In particular, we do not need to know the ABI until staging is done. 
More specifically: - use a shell command to define PKG_ABI, - replace inline uses of ABI_FILE, - run sign-packages in a subshell (this was already done for the update-packages target) so that the staging targets are done before we try to evaluate the ABI. Reviewed by: manu MFC after: 1 month Sponsored by: Innovate UK Differential Revision: https://reviews.freebsd.org/D46287 --- Makefile.inc1 | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index 19ed923702b1..62296134f7d8 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -1994,6 +1994,10 @@ REPODIR?= ${OBJROOT}repo PKG_FORMAT?= tzst PKG_REPO_SIGNING_KEY?= # empty PKG_OUTPUT_DIR?= ${PKG_VERSION} +PKG_ABI_FILE?= ${WSTAGEDIR}/usr/bin/uname +.if make(create-*-packages*) || make(real-update-packages) || make(real-sign-packages) +PKG_ABI!= ${PKG_CMD} -o ABI_FILE=${PKG_ABI_FILE} config ABI +.endif .ORDER: stage-packages create-packages .ORDER: create-packages create-world-packages @@ -2006,12 +2010,6 @@ _pkgbootstrap: .PHONY @env ASSUME_ALWAYS_YES=YES pkg bootstrap .endif -# -# Determine PKG_ABI from newvers.sh if not already set. -# -.if !defined(PKG_ABI) && (make(create-world-packages-jobs) || make(create-kernel-packages*) || make(real-update-packages) || make (create-source-packages) || make(sign-packages)) -PKG_ABI=${_TYPE}:${MAJOR_REVISION}:${TARGET_ARCH} -.endif PKG_BIN_VERSION!=${PKG_CMD} --version /dev/null |\ awk -F. 
'/^[0-9.]+$$/ {print $$1 * 10000 + $$2 * 100 + $$3}' .if ${PKG_BIN_VERSION} < 11700 @@ -2021,8 +2019,7 @@ PKG_EXT= pkg .endif .if !defined(PKG_VERSION_FROM) && make(real-update-packages) -.if defined(PKG_ABI) -.if exists(${REPODIR}/${PKG_ABI}) +.if exists(${PKG_ABI_FILE}) && exists(${REPODIR}/${PKG_ABI}) PKG_VERSION_FROM!=/usr/bin/readlink ${REPODIR}/${PKG_ABI}/latest PKG_VERSION_FROM_DIR= ${REPODIR}/${PKG_ABI}/${PKG_VERSION_FROM} BRANCH_EXT_FROM= ${PKG_VERSION_FROM:C/.*([[:alpha:]][^\.]*).*/\1/} @@ -2032,7 +2029,6 @@ PKG_VERSION_FROM_DIR= BRANCH_EXT_FROM= .endif .endif -.endif PKGMAKEARGS+= PKG_VERSION=${PKG_VERSION} \ NO_INSTALLEXTRAKERNELS=${NO_INSTALLEXTRAKERNELS} @@ -2204,7 +2200,7 @@ create-world-package-${pkgname}: .PHONY @if [ "${pkgname}" == "runtime" ]; then \ sed -i '' -e "s/%VCS_REVISION%/${VCS_REVISION}/" ${WSTAGEDIR}/${pkgname}.ucl ; \ fi - ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ + ${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \ -o OSVERSION="${SRCRELDATE}" \ create -f ${PKG_FORMAT} -M ${WSTAGEDIR}/${pkgname}.ucl \ -p ${WSTAGEDIR}/${pkgname}.plist \ @@ -2306,7 +2302,7 @@ create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kerne /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ - ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ + ${PKG_CMD} -o ABI=${PKG_ABI} -o ALLOW_BASE_SHLIBS=yes \ -o OSVERSION="${SRCRELDATE}" \ create -f ${PKG_FORMAT} \ -M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \ @@ -2318,20 +2314,22 @@ create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kerne . 
endfor .endif -sign-packages: _pkgbootstrap .PHONY +sign-packages: .PHONY + ${_+_}@cd ${.CURDIR}; \ + ${MAKE} -f Makefile.inc1 PKG_VERSION=${PKG_VERSION} real-sign-packages + +real-sign-packages: _pkgbootstrap .PHONY printf "version = 2;\n" > ${WSTAGEDIR}/meta .if ${PKG_BIN_VERSION} < 11700 printf "packing_format = \"${PKG_FORMAT}\";\n" >> ${WSTAGEDIR}/meta .endif - @[ -L "${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest" ] && \ - unlink ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest ; \ - ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname repo \ - -o OSVERSION="${SRCRELDATE}" \ + @[ -L "${REPODIR}/${PKG_ABI}/latest" ] && unlink ${REPODIR}/${PKG_ABI}/latest; \ + ${PKG_CMD} -o ABI=${PKG_ABI} repo -o OSVERSION="${SRCRELDATE}" \ -m ${WSTAGEDIR}/meta \ - -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ - ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ + -o ${REPODIR}/${PKG_ABI}/${PKG_VERSION} \ + ${REPODIR}/${PKG_ABI}/${PKG_VERSION} \ ${PKG_REPO_SIGNING_KEY} ; \ - cd ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI); \ + cd ${REPODIR}/${PKG_ABI}; \ ln -s ${PKG_OUTPUT_DIR} latest # From d7d5c9efef031aa11a841d1836f7c937dcbe4ec8 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:08:05 +0000 Subject: [PATCH 023/145] pkgbase: Let source packages be built in parallel To build the packages target, we build src and src-sys packages containing the source code from which the repo was built. These packages take significantly longer than the others, presumably because they contain many more files. Because both source packages are built to satisfy the same target, they end up being built serially. Split them into separate subtargets so that they can run in parallel. This saves a couple of minutes on my build machine. 
Reviewed by: manu, emaste MFC after: 1 month Sponsored by: Innovate UK Differential Revision: https://reviews.freebsd.org/D46288 --- Makefile.inc1 | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index 62296134f7d8..4df7f1900cf4 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -2119,17 +2119,13 @@ create-packages-source: _pkgbootstrap _repodir .PHONY create-packages: .PHONY create-packages-world create-packages-kernel create-packages-source -create-source-packages: _pkgbootstrap .PHONY +create-source-src-package: _pkgbootstrap .PHONY rm -f ${SSTAGEDIR}/*.plist 2>/dev/null || : .if !empty(GIT_CMD) && exists(${GIT_CMD}) && exists(${SRCDIR}/.git) @cd ${SRCDIR}; \ ( echo "@override_prefix /usr/src" ; \ ${GIT_CMD} ls-files --recurse-submodules ":!:sys/" ) \ > ${SSTAGEDIR}/src.plist - @cd ${SRCDIR}; \ - ( echo "@override_prefix /usr/src" ; \ - ${GIT_CMD} ls-files --recurse-submodules "sys/" ) \ - > ${SSTAGEDIR}/src-sys.plist ${SRCDIR}/release/packages/generate-ucl.lua \ PKGNAME "src" \ PKGGENNAME "src" \ @@ -2141,6 +2137,22 @@ create-source-packages: _pkgbootstrap .PHONY PKG_WWW "${PKG_WWW}" \ ${SRCDIR}/release/packages/template.ucl \ ${SSTAGEDIR}/src.ucl + ${PKG_CMD} -o ABI=${PKG_ABI} \ + -o OSVERSION="${SRCRELDATE}" \ + create -f ${PKG_FORMAT} \ + -M ${SSTAGEDIR}/src.ucl \ + -p ${SSTAGEDIR}/src.plist \ + -r ${SRCDIR} \ + -o ${REPODIR}/${PKG_ABI}/${PKG_OUTPUT_DIR} +.endif + +create-source-src-sys-package: _pkgbootstrap .PHONY + rm -f ${SSTAGEDIR}/*.plist 2>/dev/null || : +.if !empty(GIT_CMD) && exists(${GIT_CMD}) && exists(${SRCDIR}/.git) + @cd ${SRCDIR}; \ + ( echo "@override_prefix /usr/src" ; \ + ${GIT_CMD} ls-files --recurse-submodules "sys/" ) \ + > ${SSTAGEDIR}/src-sys.plist ${SRCDIR}/release/packages/generate-ucl.lua \ PKGNAME "src-sys" \ PKGGENNAME "src" \ @@ -2152,13 +2164,6 @@ create-source-packages: _pkgbootstrap .PHONY PKG_WWW "${PKG_WWW}" \ 
${SRCDIR}/release/packages/template.ucl \ ${SSTAGEDIR}/src-sys.ucl - ${PKG_CMD} -o ABI=${PKG_ABI} \ - -o OSVERSION="${SRCRELDATE}" \ - create -f ${PKG_FORMAT} \ - -M ${SSTAGEDIR}/src.ucl \ - -p ${SSTAGEDIR}/src.plist \ - -r ${SRCDIR} \ - -o ${REPODIR}/${PKG_ABI}/${PKG_OUTPUT_DIR} ${PKG_CMD} -o ABI=${PKG_ABI} \ -o OSVERSION="${SRCRELDATE}" \ create -f ${PKG_FORMAT} \ @@ -2168,6 +2173,8 @@ create-source-packages: _pkgbootstrap .PHONY -o ${REPODIR}/${PKG_ABI}/${PKG_OUTPUT_DIR} .endif +create-source-packages: .PHONY _pkgbootstrap create-source-src-package create-source-src-sys-package + create-world-packages: _pkgbootstrap .PHONY @rm -f ${WSTAGEDIR}/*.plist 2>/dev/null || : @cd ${WSTAGEDIR} ; \ From 9897a66923a3e79c22fcbd4bc80afae9eb9f277c Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:08:44 +0000 Subject: [PATCH 024/145] pf: Let rdr rules modify the src port if doing so would avoid a conflict If NAT rules cause inbound connections to different external IPs to be mapped to the same internal IP, and some application uses the same source port for multiple such connections, rdr translation may result in conflicts that cause some of the connections to be dropped. Address this by letting rdr rules detect state conflicts and modulate the source port to avoid them. Reviewed by: kp, allanjude MFC after: 3 months Sponsored by: Klara, Inc. 
Sponsored by: Modirum Differential Revision: https://reviews.freebsd.org/D44488 --- share/man/man5/pf.conf.5 | 9 ++- sys/netpfil/pf/pf_lb.c | 70 +++++++++++++++++-- tests/sys/netpfil/pf/Makefile | 1 + tests/sys/netpfil/pf/rdr-srcport.py | 20 ++++++ tests/sys/netpfil/pf/rdr.sh | 100 ++++++++++++++++++++++++++++ 5 files changed, 191 insertions(+), 9 deletions(-) create mode 100644 tests/sys/netpfil/pf/rdr-srcport.py diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5 index da55f00293bb..f04b0799741e 100644 --- a/share/man/man5/pf.conf.5 +++ b/share/man/man5/pf.conf.5 @@ -27,7 +27,7 @@ .\" ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd June 6, 2024 +.Dd June 24, 2024 .Dt PF.CONF 5 .Os .Sh NAME @@ -1400,9 +1400,14 @@ or .Xr udp 4 connections; implicitly in the case of .Ar nat -rules and explicitly in the case of +rules and both implicitly and explicitly in the case of .Ar rdr rules. +A +.Ar rdr +rule may cause the source port to be modified if doing so avoids a conflict +with an existing connection. +A random source port in the range 50001-65535 is chosen in this case. Port numbers are never translated with a .Ar binat rule. diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c index 4fcad7e578a8..4b703d3d02da 100644 --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -600,7 +600,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, { struct pf_krule *r = NULL; struct pf_addr *naddr; - uint16_t *nport; + uint16_t *nportp; uint16_t low, high; PF_RULES_RASSERT(); @@ -643,9 +643,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, return (NULL); } - /* XXX We only modify one side for now. 
*/ naddr = &(*nkp)->addr[1]; - nport = &(*nkp)->port[1]; + nportp = &(*nkp)->port[1]; switch (r->action) { case PF_NAT: @@ -658,7 +657,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, } if (r->rpool.mape.offset > 0) { if (pf_get_mape_sport(pd->af, pd->proto, r, saddr, - sport, daddr, dport, naddr, nport, sn)) { + sport, daddr, dport, naddr, nportp, sn)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: MAP-E port allocation (%u/%u/%u)" " failed\n", @@ -668,7 +667,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, goto notrans; } } else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, - daddr, dport, naddr, nport, low, high, sn)) { + daddr, dport, naddr, nportp, low, high, sn)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation (%u-%u) failed\n", r->rpool.proxy_port[0], r->rpool.proxy_port[1])); @@ -742,6 +741,9 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, } break; case PF_RDR: { + struct pf_state_key_cmp key; + uint16_t cut, low, high, nport; + if (pf_map_addr(pd->af, r, saddr, naddr, NULL, NULL, sn)) goto notrans; if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) @@ -762,9 +764,63 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, /* Wrap around if necessary. */ if (tmp_nport > 65535) tmp_nport -= 65535; - *nport = htons((uint16_t)tmp_nport); + nport = htons((uint16_t)tmp_nport); } else if (r->rpool.proxy_port[0]) - *nport = htons(r->rpool.proxy_port[0]); + nport = htons(r->rpool.proxy_port[0]); + else + nport = dport; + + /* + * Update the destination port. + */ + *nportp = nport; + + /* + * Do we have a source port conflict in the stack state? Try to + * modulate the source port if so. Note that this is racy since + * the state lookup may not find any matches here but will once + * pf_create_state() actually instantiates the state. 
+ */ + bzero(&key, sizeof(key)); + key.af = pd->af; + key.proto = pd->proto; + key.port[0] = sport; + PF_ACPY(&key.addr[0], saddr, key.af); + key.port[1] = nport; + PF_ACPY(&key.addr[1], naddr, key.af); + + if (!pf_find_state_all_exists(&key, PF_OUT)) + break; + + low = 50001; /* XXX-MJ PF_NAT_PROXY_PORT_LOW/HIGH */ + high = 65535; + cut = arc4random() % (1 + high - low) + low; + for (uint32_t tmp = cut; + tmp <= high && tmp <= UINT16_MAX; tmp++) { + key.port[0] = htons(tmp); + if (!pf_find_state_all_exists(&key, PF_OUT)) { + /* Update the source port. */ + (*nkp)->port[0] = htons(tmp); + goto out; + } + } + for (uint32_t tmp = cut - 1; tmp >= low; tmp--) { + key.port[0] = htons(tmp); + if (!pf_find_state_all_exists(&key, PF_OUT)) { + /* Update the source port. */ + (*nkp)->port[0] = htons(tmp); + goto out; + } + } + + DPFPRINTF(PF_DEBUG_MISC, + ("pf: RDR source port allocation failed\n")); + if (0) { +out: + DPFPRINTF(PF_DEBUG_MISC, + ("pf: RDR source port allocation %u->%u\n", + ntohs(sport), ntohs((*nkp)->port[0]))); + } break; } default: diff --git a/tests/sys/netpfil/pf/Makefile b/tests/sys/netpfil/pf/Makefile index 4a16642a967b..2b3cb9fbd858 100644 --- a/tests/sys/netpfil/pf/Makefile +++ b/tests/sys/netpfil/pf/Makefile @@ -73,6 +73,7 @@ ${PACKAGE}FILES+= CVE-2019-5597.py \ pfsync_defer.py \ pft_ether.py \ pft_read_ipfix.py \ + rdr-srcport.py \ utils.subr ${PACKAGE}FILESMODE_CVE-2019-5597.py= 0555 diff --git a/tests/sys/netpfil/pf/rdr-srcport.py b/tests/sys/netpfil/pf/rdr-srcport.py new file mode 100644 index 000000000000..633580582711 --- /dev/null +++ b/tests/sys/netpfil/pf/rdr-srcport.py @@ -0,0 +1,20 @@ +# +# A helper script which accepts TCP connections and writes the remote port +# number to the stream. 
+# + +import socket + +def main(): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.bind(('0.0.0.0', 8888)) + s.listen(5) + + while True: + cs, addr = s.accept() + cs.sendall(str(addr[1]).encode()) + cs.close() + +if __name__ == '__main__': + main() diff --git a/tests/sys/netpfil/pf/rdr.sh b/tests/sys/netpfil/pf/rdr.sh index b7ec80b4d85e..135bfd42c1f4 100644 --- a/tests/sys/netpfil/pf/rdr.sh +++ b/tests/sys/netpfil/pf/rdr.sh @@ -121,7 +121,107 @@ tcp_v6_cleanup() pft_cleanup } + +atf_test_case "srcport" "cleanup" +srcport_head() +{ + atf_set descr 'TCP rdr srcport modulation' + atf_set require.user root + atf_set require.progs python3 + atf_set timeout 9999 +} + +# +# Test that rdr works for multiple TCP with same srcip and srcport. +# +# Four jails, a, b, c, d, are used: +# - jail d runs a server on port 8888, +# - jail a makes connections to the server, routed through jails b and c, +# - jail b uses NAT to rewrite source addresses and ports to the same 2-tuple, +# avoiding the need to use SO_REUSEADDR in jail a, +# - jail c uses a redirect rule to map the destination address to the same +# address and port, resulting in a NAT state conflict. +# +# In this case, the rdr rule should also rewrite the source port (again) to +# resolve the state conflict. 
+# +srcport_body() +{ + pft_init + + j="rdr:srcport" + epair1=$(vnet_mkepair) + epair2=$(vnet_mkepair) + epair3=$(vnet_mkepair) + + echo $epair_one + echo $epair_two + + vnet_mkjail ${j}a ${epair1}a + vnet_mkjail ${j}b ${epair1}b ${epair2}a + vnet_mkjail ${j}c ${epair2}b ${epair3}a + vnet_mkjail ${j}d ${epair3}b + + # configure addresses for a + jexec ${j}a ifconfig lo0 up + jexec ${j}a ifconfig ${epair1}a inet 198.51.100.50/24 up + jexec ${j}a ifconfig ${epair1}a inet alias 198.51.100.51/24 + jexec ${j}a ifconfig ${epair1}a inet alias 198.51.100.52/24 + + # configure addresses for b + jexec ${j}b ifconfig lo0 up + jexec ${j}b ifconfig ${epair1}b inet 198.51.100.1/24 up + jexec ${j}b ifconfig ${epair2}a inet 198.51.101.2/24 up + + # configure addresses for c + jexec ${j}c ifconfig lo0 up + jexec ${j}c ifconfig ${epair2}b inet 198.51.101.3/24 up + jexec ${j}c ifconfig ${epair2}b inet alias 198.51.101.4/24 + jexec ${j}c ifconfig ${epair2}b inet alias 198.51.101.5/24 + jexec ${j}c ifconfig ${epair3}a inet 203.0.113.1/24 up + + # configure addresses for d + jexec ${j}d ifconfig lo0 up + jexec ${j}d ifconfig ${epair3}b inet 203.0.113.50/24 up + + jexec ${j}b sysctl net.inet.ip.forwarding=1 + jexec ${j}c sysctl net.inet.ip.forwarding=1 + jexec ${j}b pfctl -e + jexec ${j}c pfctl -e + + pft_set_rules ${j}b \ + "set debug misc" \ + "nat on ${epair2}a inet from 198.51.100.0/24 to any -> ${epair2}a static-port" + + pft_set_rules ${j}c \ + "set debug misc" \ + "rdr on ${epair2}b proto tcp from any to ${epair2}b port 7777 -> 203.0.113.50 port 8888" + + jexec ${j}a route add default 198.51.100.1 + jexec ${j}c route add 198.51.100.0/24 198.51.101.2 + jexec ${j}d route add 198.51.101.0/24 203.0.113.1 + + jexec ${j}d python3 $(atf_get_srcdir)/rdr-srcport.py & + sleep 1 + + echo a | jexec ${j}a nc -w 3 -s 198.51.100.50 -p 1234 198.51.101.3 7777 > port1 + + jexec ${j}a nc -s 198.51.100.51 -p 1234 198.51.101.4 7777 > port2 & + jexec ${j}a nc -s 198.51.100.52 -p 1234 198.51.101.5 7777 
> port3 & + sleep 1 + + atf_check -o inline:"1234" cat port1 + atf_check -o match:"[0-9]+" -o not-inline:"1234" cat port2 + atf_check -o match:"[0-9]+" -o not-inline:"1234" cat port3 +} + +srcport_cleanup() +{ + pft_cleanup +} + atf_init_test_cases() { atf_add_test_case "tcp_v6" + atf_add_test_case "srcport" } From 7e65cfc9bbe5a9d735ef38f7ed49965b234b8a20 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:14:30 +0000 Subject: [PATCH 025/145] pf: Make pf_get_translation() more expressive Currently pf_get_translation() returns a pointer to a matching nat/rdr/binat rule, or NULL if no rule was matched or an error occurred while applying the translation. That is, we don't distinguish between errors and the lack of a matching rule. Thus, if an error (e.g., a memory allocation failure or a state conflict) occurs, we simply handle the packet as if no translation rule was present. This is not desirable. Make pf_get_translation() return the matching rule as an out-param and instead return a reason code which indicates whether there was no translation rule, or there was a translation rule and we failed to apply it, or there was a translation rule and we applied it successfully. Reviewed by: kp, allanjude MFC after: 3 months Sponsored by: Klara, Inc. 
Sponsored by: Modirum Differential Revision: https://reviews.freebsd.org/D45672 --- sys/net/pfvar.h | 5 ++-- sys/netpfil/pf/pf.c | 18 ++++++++++--- sys/netpfil/pf/pf_lb.c | 57 ++++++++++++++++++++++++++++-------------- 3 files changed, 55 insertions(+), 25 deletions(-) diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index 863883c2d61e..d66e6f799761 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -2569,11 +2569,12 @@ u_short pf_map_addr(u_int8_t, struct pf_krule *, struct pf_addr *, struct pf_addr *, struct pfi_kkif **nkif, struct pf_addr *, struct pf_ksrc_node **); -struct pf_krule *pf_get_translation(struct pf_pdesc *, struct mbuf *, +u_short pf_get_translation(struct pf_pdesc *, struct mbuf *, int, struct pfi_kkif *, struct pf_ksrc_node **, struct pf_state_key **, struct pf_state_key **, struct pf_addr *, struct pf_addr *, - uint16_t, uint16_t, struct pf_kanchor_stackframe *); + uint16_t, uint16_t, struct pf_kanchor_stackframe *, + struct pf_krule **); struct pf_state_key *pf_state_key_setup(struct pf_pdesc *, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t); diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 0547e29e04c2..2bbd231b3ee9 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -4605,7 +4605,7 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, struct pfi_kkif *kif, struct pf_ksrc_node *nsn = NULL; struct tcphdr *th = &pd->hdr.tcp; struct pf_state_key *sk = NULL, *nk = NULL; - u_short reason; + u_short reason, transerror; int rewrite = 0, hdrlen = 0; int tag = -1; int asd = 0; @@ -4618,6 +4618,8 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, struct pfi_kkif *kif, PF_RULES_RASSERT(); + SLIST_INIT(&match_rules); + if (inp != NULL) { INP_LOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; @@ -4686,8 +4688,17 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, struct pfi_kkif *kif, r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); /* check packet for BINAT/NAT/RDR */ 
- if ((nr = pf_get_translation(pd, m, off, kif, &nsn, &sk, - &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) { + transerror = pf_get_translation(pd, m, off, kif, &nsn, &sk, + &nk, saddr, daddr, sport, dport, anchor_stack, &nr); + switch (transerror) { + default: + /* A translation error occurred. */ + REASON_SET(&reason, transerror); + goto cleanup; + case PFRES_MAX: + /* No match. */ + break; + case PFRES_MATCH: KASSERT(sk != NULL, ("%s: null sk", __func__)); KASSERT(nk != NULL, ("%s: null nk", __func__)); @@ -4836,7 +4847,6 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, struct pfi_kkif *kif, pd->nat_rule = nr; } - SLIST_INIT(&match_rules); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c index 4b703d3d02da..68fc76233dab 100644 --- a/sys/netpfil/pf/pf_lb.c +++ b/sys/netpfil/pf/pf_lb.c @@ -591,22 +591,26 @@ pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr, return (reason); } -struct pf_krule * +u_short pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, struct pfi_kkif *kif, struct pf_ksrc_node **sn, struct pf_state_key **skp, struct pf_state_key **nkp, struct pf_addr *saddr, struct pf_addr *daddr, - uint16_t sport, uint16_t dport, struct pf_kanchor_stackframe *anchor_stack) + uint16_t sport, uint16_t dport, struct pf_kanchor_stackframe *anchor_stack, + struct pf_krule **rp) { struct pf_krule *r = NULL; struct pf_addr *naddr; uint16_t *nportp; uint16_t low, high; + u_short reason; PF_RULES_RASSERT(); KASSERT(*skp == NULL, ("*skp not NULL")); KASSERT(*nkp == NULL, ("*nkp not NULL")); + *rp = NULL; + if (pd->dir == PF_OUT) { r = pf_match_translation(pd, m, off, kif, saddr, sport, daddr, dport, PF_RULESET_BINAT, anchor_stack); @@ -624,23 +628,23 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, } if (r == NULL) - return (NULL); + return (PFRES_MAX); switch (r->action) { case 
PF_NONAT: case PF_NOBINAT: case PF_NORDR: - return (NULL); + return (PFRES_MAX); } *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport); if (*skp == NULL) - return (NULL); + return (PFRES_MEMORY); *nkp = pf_state_key_clone(*skp); if (*nkp == NULL) { uma_zfree(V_pf_state_key_z, *skp); *skp = NULL; - return (NULL); + return (PFRES_MEMORY); } naddr = &(*nkp)->addr[1]; @@ -664,6 +668,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, r->rpool.mape.offset, r->rpool.mape.psidlen, r->rpool.mape.psid)); + reason = PFRES_MAPFAILED; goto notrans; } } else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, @@ -671,6 +676,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation (%u-%u) failed\n", r->rpool.proxy_port[0], r->rpool.proxy_port[1])); + reason = PFRES_MAPFAILED; goto notrans; } break; @@ -682,8 +688,10 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, #ifdef INET case AF_INET: if (r->rpool.cur->addr.p.dyn-> - pfid_acnt4 < 1) + pfid_acnt4 < 1) { + reason = PFRES_MAPFAILED; goto notrans; + } PF_POOLMASK(naddr, &r->rpool.cur->addr.p.dyn-> pfid_addr4, @@ -694,8 +702,10 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, #ifdef INET6 case AF_INET6: if (r->rpool.cur->addr.p.dyn-> - pfid_acnt6 < 1) + pfid_acnt6 < 1) { + reason = PFRES_MAPFAILED; goto notrans; + } PF_POOLMASK(naddr, &r->rpool.cur->addr.p.dyn-> pfid_addr6, @@ -715,8 +725,10 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, switch (pd->af) { #ifdef INET case AF_INET: - if (r->src.addr.p.dyn-> pfid_acnt4 < 1) + if (r->src.addr.p.dyn->pfid_acnt4 < 1) { + reason = PFRES_MAPFAILED; goto notrans; + } PF_POOLMASK(naddr, &r->src.addr.p.dyn->pfid_addr4, &r->src.addr.p.dyn->pfid_mask4, @@ -725,8 +737,10 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, #endif /* INET */ #ifdef INET6 case AF_INET6: - if (r->src.addr.p.dyn->pfid_acnt6 < 1) + if 
(r->src.addr.p.dyn->pfid_acnt6 < 1) { + reason = PFRES_MAPFAILED; goto notrans; + } PF_POOLMASK(naddr, &r->src.addr.p.dyn->pfid_addr6, &r->src.addr.p.dyn->pfid_mask6, @@ -744,7 +758,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, struct pf_state_key_cmp key; uint16_t cut, low, high, nport; - if (pf_map_addr(pd->af, r, saddr, naddr, NULL, NULL, sn)) + reason = pf_map_addr(pd->af, r, saddr, naddr, NULL, NULL, sn); + if (reason != 0) goto notrans; if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask, @@ -815,12 +830,13 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, DPFPRINTF(PF_DEBUG_MISC, ("pf: RDR source port allocation failed\n")); - if (0) { + reason = PFRES_MAPFAILED; + goto notrans; + out: - DPFPRINTF(PF_DEBUG_MISC, - ("pf: RDR source port allocation %u->%u\n", - ntohs(sport), ntohs((*nkp)->port[0]))); - } + DPFPRINTF(PF_DEBUG_MISC, + ("pf: RDR source port allocation %u->%u\n", + ntohs(sport), ntohs((*nkp)->port[0]))); break; } default: @@ -828,14 +844,17 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, } /* Return success only if translation really happened. */ - if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp))) - return (r); + if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp))) { + *rp = r; + return (PFRES_MATCH); + } + reason = PFRES_MAX; notrans: uma_zfree(V_pf_state_key_z, *nkp); uma_zfree(V_pf_state_key_z, *skp); *skp = *nkp = NULL; *sn = NULL; - return (NULL); + return (reason); } From aa141adc039a5418d7b7800094115d861baf91a0 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:20:03 +0000 Subject: [PATCH 026/145] socket: Split up soreceive_stream() Factor out the bits that run with the sock I/O lock held into a separate function. No functional change intended. Reviewed by: gallatin, glebius MFC after: 2 weeks Sponsored by: Klara, Inc. 
Sponsored by: Stormshield Differential Revision: https://reviews.freebsd.org/D46303 --- sys/kern/uipc_socket.c | 109 +++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 9a44c1d557f0..5886c7ac84f4 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -2566,56 +2566,15 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, /* * Optimized version of soreceive() for stream (TCP) sockets. */ -int -soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, - struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +static int +soreceive_stream_locked(struct socket *so, struct sockbuf *sb, + struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, + struct mbuf **controlp, int flags) { - int len = 0, error = 0, flags, oresid; - struct sockbuf *sb; + int len = 0, error = 0, oresid; struct mbuf *m, *n = NULL; - /* We only do stream sockets. */ - if (so->so_type != SOCK_STREAM) - return (EINVAL); - if (psa != NULL) - *psa = NULL; - if (flagsp != NULL) - flags = *flagsp &~ MSG_EOR; - else - flags = 0; - if (controlp != NULL) - *controlp = NULL; - if (flags & MSG_OOB) - return (soreceive_rcvoob(so, uio, flags)); - if (mp0 != NULL) - *mp0 = NULL; - - sb = &so->so_rcv; - -#ifdef KERN_TLS - /* - * KTLS store TLS records as records with a control message to - * describe the framing. - * - * We check once here before acquiring locks to optimize the - * common case. - */ - if (sb->sb_tls_info != NULL) - return (soreceive_generic(so, psa, uio, mp0, controlp, - flagsp)); -#endif - - /* Prevent other readers from entering the socket. 
*/ - error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); - if (error) - return (error); -#ifdef KERN_TLS - if (__predict_false(sb->sb_tls_info != NULL)) { - SOCK_IO_RECV_UNLOCK(so); - return (soreceive_generic(so, psa, uio, mp0, controlp, - flagsp)); - } -#endif + SOCK_IO_RECV_ASSERT_LOCKED(so); SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ @@ -2778,6 +2737,62 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); + return (error); +} + +int +soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct sockbuf *sb; + int error, flags; + + sb = &so->so_rcv; + + /* We only do stream sockets. */ + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (psa != NULL) + *psa = NULL; + if (flagsp != NULL) + flags = *flagsp & ~MSG_EOR; + else + flags = 0; + if (controlp != NULL) + *controlp = NULL; + if (flags & MSG_OOB) + return (soreceive_rcvoob(so, uio, flags)); + if (mp0 != NULL) + *mp0 = NULL; + +#ifdef KERN_TLS + /* + * KTLS store TLS records as records with a control message to + * describe the framing. + * + * We check once here before acquiring locks to optimize the + * common case. + */ + if (sb->sb_tls_info != NULL) + return (soreceive_generic(so, psa, uio, mp0, controlp, + flagsp)); +#endif + + /* + * Prevent other threads from reading from the socket. This lock may be + * dropped in order to sleep waiting for data to arrive. 
+ */ + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); + if (error) + return (error); +#ifdef KERN_TLS + if (__predict_false(sb->sb_tls_info != NULL)) { + SOCK_IO_RECV_UNLOCK(so); + return (soreceive_generic(so, psa, uio, mp0, controlp, + flagsp)); + } +#endif + error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); SOCK_IO_RECV_UNLOCK(so); return (error); } From 0a68f644dca19670686007071479f919a56ea37f Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:20:19 +0000 Subject: [PATCH 027/145] socket: Split up soreceive_generic() Factor out the bits that run with the sock I/O lock held into a separate function. No functional change intended. Reviewed by: gallatin, glebius MFC after: 2 weeks Sponsored by: Klara, Inc. Sponsored by: Stormshield Differential Revision: https://reviews.freebsd.org/D46304 --- sys/kern/uipc_socket.c | 51 ++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 5886c7ac84f4..e7c4a85d5970 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -2070,11 +2070,11 @@ sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. 
*/ -int -soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, - struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +static int +soreceive_generic_locked(struct socket *so, struct sockaddr **psa, + struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { - struct mbuf *m, **mp; + struct mbuf *m; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; @@ -2083,25 +2083,15 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, ssize_t orig_resid = uio->uio_resid; bool report_real_len = false; - mp = mp0; - if (psa != NULL) - *psa = NULL; - if (controlp != NULL) - *controlp = NULL; + SOCK_IO_RECV_ASSERT_LOCKED(so); + + error = 0; if (flagsp != NULL) { report_real_len = *flagsp & MSG_TRUNC; *flagsp &= ~MSG_TRUNC; flags = *flagsp &~ MSG_EOR; } else flags = 0; - if (flags & MSG_OOB) - return (soreceive_rcvoob(so, uio, flags)); - if (mp != NULL) - *mp = NULL; - - error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); - if (error) - return (error); restart: SOCKBUF_LOCK(&so->so_rcv); @@ -2559,6 +2549,33 @@ soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, if (flagsp != NULL) *flagsp |= flags; release: + return (error); +} + +int +soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp, struct mbuf **controlp, int *flagsp) +{ + int error, flags; + + if (psa != NULL) + *psa = NULL; + if (controlp != NULL) + *controlp = NULL; + if (flagsp != NULL) { + flags = *flagsp; + if ((flags & MSG_OOB) != 0) + return (soreceive_rcvoob(so, uio, flags)); + } else { + flags = 0; + } + if (mp != NULL) + *mp = NULL; + + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); + if (error) + return (error); + error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); SOCK_IO_RECV_UNLOCK(so); return (error); } From fb901935f257ddcc492fe9efb605797f181c6597 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 14:20:43 +0000 
Subject: [PATCH 028/145] socket: Split up sosend_generic() Factor out the bits that run with the sock I/O lock held into a separate function. In this implementation, we are doing a bit more work under the I/O lock than before. However, lock contention is only a problem when multiple threads are transmitting on the same socket, which is an unusual case that is not expected to perform well in any case. No functional change intended. Reviewed by: gallatin, glebius MFC after: 2 weeks Sponsored by: Klara, Inc. Sponsored by: Stormshield Differential Revision: https://reviews.freebsd.org/D46305 --- sys/kern/uipc_socket.c | 47 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index e7c4a85d5970..a5e88fc7ffc6 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1656,8 +1656,8 @@ sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. 
*/ -int -sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, +static int +sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; @@ -1673,6 +1673,9 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif + + SOCK_IO_SEND_ASSERT_LOCKED(so); + if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) @@ -1702,10 +1705,6 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, if (control != NULL) clen = control->m_len; - error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); - if (error) - goto out; - #ifdef KERN_TLS tls_send_flag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); @@ -1728,7 +1727,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, if (resid == 0 && !ktls_permit_empty_frames(tls)) { error = EINVAL; - goto release; + goto out; } } #endif @@ -1739,13 +1738,13 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; - goto release; + goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); - goto release; + goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* @@ -1759,7 +1758,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, if (!(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; - goto release; + goto out; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); @@ -1767,7 +1766,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, error = ENOTCONN; else error = EDESTADDRREQ; - goto release; + goto out; } } space = sbspace(&so->so_snd); @@ -1777,7 +1776,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, clen > so->so_snd.sb_hiwat) { 
SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; - goto release; + goto out; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { @@ -1785,12 +1784,12 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; - goto release; + goto out; } error = sbwait(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); if (error) - goto release; + goto out; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); @@ -1835,7 +1834,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ - goto release; + goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; @@ -1899,12 +1898,10 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, control = NULL; top = NULL; if (error) - goto release; + goto out; } while (resid && space > 0); } while (resid); -release: - SOCK_IO_SEND_UNLOCK(so); out: #ifdef KERN_TLS if (tls != NULL) @@ -1917,6 +1914,20 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, return (error); } +int +sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *control, int flags, struct thread *td) +{ + int error; + + error = SOCK_IO_SEND_LOCK(so, 0); + if (error) + return (error); + error = sosend_generic_locked(so, addr, uio, top, control, flags, td); + SOCK_IO_SEND_UNLOCK(so); + return (error); +} + /* * Send to a socket from a kernel thread. * From 6982be38cb7e4254ff4ffbb334cd7e234b5f6cc2 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 5 Jul 2024 14:01:29 -0400 Subject: [PATCH 029/145] socket: Microoptimize soreceive_stream_locked() There is no need to hold the sockbuf lock while checking uio_resid. No functional change intended. MFC after: 2 weeks Sponsored by: Klara, Inc. 
Sponsored by: Stormshield --- sys/kern/uipc_socket.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index a5e88fc7ffc6..13b6253bd115 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -2604,14 +2604,12 @@ soreceive_stream_locked(struct socket *so, struct sockbuf *sb, SOCK_IO_RECV_ASSERT_LOCKED(so); - SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ - if (uio->uio_resid == 0) { - error = EINVAL; - goto out; - } + if (uio->uio_resid == 0) + return (EINVAL); oresid = uio->uio_resid; + SOCKBUF_LOCK(sb); /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; From d02dcf21eea3973a714294b011537c2af6c747fa Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 15:22:03 +0000 Subject: [PATCH 030/145] pkgbase: Make src package creation recipes more precise Just remove the plist created by the respective rule. Otherwise the two recipes can race with each other. 
Fixes: d7d5c9efef03 ("pkgbase: Let source packages be built in parallel") Reviewed by: bapt, emaste Reported by: Mark Millard Differential Revision: https://reviews.freebsd.org/D46320 --- Makefile.inc1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index 4df7f1900cf4..5400cb5e734a 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -2120,7 +2120,7 @@ create-packages-source: _pkgbootstrap _repodir .PHONY create-packages: .PHONY create-packages-world create-packages-kernel create-packages-source create-source-src-package: _pkgbootstrap .PHONY - rm -f ${SSTAGEDIR}/*.plist 2>/dev/null || : + rm -f ${SSTAGEDIR}/src.plist 2>/dev/null || : .if !empty(GIT_CMD) && exists(${GIT_CMD}) && exists(${SRCDIR}/.git) @cd ${SRCDIR}; \ ( echo "@override_prefix /usr/src" ; \ @@ -2147,7 +2147,7 @@ create-source-src-package: _pkgbootstrap .PHONY .endif create-source-src-sys-package: _pkgbootstrap .PHONY - rm -f ${SSTAGEDIR}/*.plist 2>/dev/null || : + rm -f ${SSTAGEDIR}/src-sys.plist 2>/dev/null || : .if !empty(GIT_CMD) && exists(${GIT_CMD}) && exists(${SRCDIR}/.git) @cd ${SRCDIR}; \ ( echo "@override_prefix /usr/src" ; \ From 6a88e22728d285c4df17216515ce2b8d1e5a6835 Mon Sep 17 00:00:00 2001 From: Kristof Provost Date: Fri, 16 Aug 2024 14:55:31 +0200 Subject: [PATCH 031/145] pfctl: pfik_ifp is always NULL The pfik_ifp field is not provided by the kernel, it is always NULL. Do not check for it. This caused us to not clear the skip flag on interfaces, leading to unexpected behaviour when a 'set skip' was removed. 
PR: 280834 Sponsored by: Rubicon Communications, LLC ("Netgate") Differential Revision: https://reviews.freebsd.org/D46311 --- sbin/pfctl/pfctl.c | 7 +--- tests/sys/netpfil/pf/set_skip.sh | 61 ++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/sbin/pfctl/pfctl.c b/sbin/pfctl/pfctl.c index b60e64fba338..45bfdf31f8dc 100644 --- a/sbin/pfctl/pfctl.c +++ b/sbin/pfctl/pfctl.c @@ -394,8 +394,6 @@ pfctl_check_skip_ifaces(char *ifname) continue; for (n = h; n != NULL; n = n->next) { - if (p->pfik_ifp == NULL) - continue; if (strncmp(p->pfik_name, ifname, IFNAMSIZ)) continue; @@ -422,9 +420,6 @@ pfctl_adjust_skip_ifaces(struct pfctl *pf) for (n = h; n != NULL; n = n->next) PFRB_FOREACH(pp, &skip_b) { - if (pp->pfik_ifp == NULL) - continue; - if (strncmp(pp->pfik_name, n->ifname, IFNAMSIZ)) continue; @@ -437,7 +432,7 @@ pfctl_adjust_skip_ifaces(struct pfctl *pf) } PFRB_FOREACH(p, &skip_b) { - if (p->pfik_ifp == NULL || ! (p->pfik_flags & PFI_IFLAG_SKIP)) + if (! (p->pfik_flags & PFI_IFLAG_SKIP)) continue; pfctl_set_interface_flags(pf, p->pfik_name, PFI_IFLAG_SKIP, 0); diff --git a/tests/sys/netpfil/pf/set_skip.sh b/tests/sys/netpfil/pf/set_skip.sh index e5b1440360e9..e984377721b8 100644 --- a/tests/sys/netpfil/pf/set_skip.sh +++ b/tests/sys/netpfil/pf/set_skip.sh @@ -26,6 +26,50 @@ . 
$(atf_get_srcdir)/utils.subr +atf_test_case "unset" "cleanup" +unset_head() +{ + atf_set descr 'Unset set skip test' + atf_set require.user root +} + +unset_body() +{ + pft_init + + vnet_mkjail alcatraz + jexec alcatraz ifconfig lo0 127.0.0.1/8 up + jexec alcatraz pfctl -e + pft_set_rules alcatraz "set skip on lo0" \ + "block in proto icmp" + + echo "set skip" + jexec alcatraz pfctl -v -sI + + jexec alcatraz ifconfig + atf_check -s exit:0 -o ignore jexec alcatraz ping -c 1 127.0.0.1 + + # Unset the skip on the group + pft_set_rules noflush alcatraz \ + "block in proto icmp" + + echo "No setskip" + jexec alcatraz pfctl -v -sI + + # Do flush states + jexec alcatraz pfctl -Fs + + # And now our ping is blocked + atf_check -s exit:2 -o ignore jexec alcatraz ping -c 1 127.0.0.1 + + jexec alcatraz pfctl -v -sI +} + +unset_cleanup() +{ + pft_cleanup +} + atf_test_case "set_skip_group" "cleanup" set_skip_group_head() { @@ -45,8 +89,24 @@ set_skip_group_body() pft_set_rules alcatraz "set skip on foo" \ "block in proto icmp" + echo "set skip" + jexec alcatraz pfctl -v -sI + jexec alcatraz ifconfig atf_check -s exit:0 -o ignore jexec alcatraz ping -c 1 127.0.0.1 + + # Unset the skip on the group + pft_set_rules noflush alcatraz \ + "block in proto icmp" + + # Do flush states + jexec alcatraz pfctl -Fs + + # And now our ping is blocked + atf_check -s exit:2 -o ignore jexec alcatraz ping -c 1 127.0.0.1 + + echo "No setskip" + jexec alcatraz pfctl -v -sI } set_skip_group_cleanup() @@ -163,6 +223,7 @@ pr255852_cleanup() atf_init_test_cases() { + atf_add_test_case "unset" atf_add_test_case "set_skip_group" atf_add_test_case "set_skip_group_lo" atf_add_test_case "set_skip_dynamic" From 22a632c366a98692d7114135241c10f154e52a76 Mon Sep 17 00:00:00 2001 From: Igor Ostapenko Date: Fri, 16 Aug 2024 16:49:06 +0200 Subject: [PATCH 032/145] pf: Make pf_test6 handle m_len < sizeof(struct ip6_hdr) case Reviewed by: kp Differential Revision: https://reviews.freebsd.org/D46312 --- 
sys/netpfil/pf/pf.c | 8 +++++ tests/sys/netpfil/pf/mbuf.sh | 61 ++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 2bbd231b3ee9..9b1601ac0ee5 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -8946,6 +8946,14 @@ pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb pd.af = AF_INET6; pd.act.rtableid = -1; + if (__predict_false(m->m_len < sizeof(struct ip6_hdr)) && + (m = *m0 = m_pullup(*m0, sizeof(struct ip6_hdr))) == NULL) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test6: m_len < sizeof(struct ip6_hdr)" + ", pullup failed\n")); + PF_RULES_RUNLOCK(); + return (PF_DROP); + } h = mtod(m, struct ip6_hdr *); off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); diff --git a/tests/sys/netpfil/pf/mbuf.sh b/tests/sys/netpfil/pf/mbuf.sh index 082de08b0838..2dffa48ed2f5 100644 --- a/tests/sys/netpfil/pf/mbuf.sh +++ b/tests/sys/netpfil/pf/mbuf.sh @@ -91,7 +91,68 @@ inet_in_mbuf_len_cleanup() pft_cleanup } +atf_test_case "inet6_in_mbuf_len" "cleanup" +inet6_in_mbuf_len_head() +{ + atf_set descr 'Test that pf can handle inbound with the first mbuf with m_len < sizeof(struct ip6_hdr)' + atf_set require.user root +} +inet6_in_mbuf_len_body() +{ + pft_init + dummymbuf_init + + epair=$(vnet_mkepair) + ifconfig ${epair}a inet6 2001:db8::1/64 up no_dad + + # Set up a simple jail with one interface + vnet_mkjail alcatraz ${epair}b + jexec alcatraz ifconfig ${epair}b inet6 2001:db8::2/64 up no_dad + + # Sanity check + atf_check -s exit:0 -o ignore ping -c1 2001:db8::2 + + # Should be denied + jexec alcatraz pfctl -e + pft_set_rules alcatraz \ + "block" \ + "pass quick inet6 proto icmp6 icmp6-type { neighbrsol, neighbradv }" + atf_check -s not-exit:0 -o ignore ping -c1 -t1 2001:db8::2 + + # Should be allowed by from/to addresses + pft_set_rules alcatraz \ + "block" \ + "pass quick inet6 proto icmp6 icmp6-type { neighbrsol, neighbradv }" \ + "pass in inet6 from 2001:db8::1 
to 2001:db8::2" + atf_check -s exit:0 -o ignore ping -c1 2001:db8::2 + + # Should still work for m_len=0 + jexec alcatraz pfilctl link -i dummymbuf:inet6 inet6 + jexec alcatraz sysctl net.dummymbuf.rules="inet6 in ${epair}b pull-head 0;" + atf_check_equal "0" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" + atf_check -s exit:0 -o ignore ping -c1 2001:db8::2 + atf_check_equal "1" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" + + # m_len=1 + jexec alcatraz sysctl net.dummymbuf.rules="inet6 in ${epair}b pull-head 1;" + jexec alcatraz sysctl net.dummymbuf.hits=0 + atf_check -s exit:0 -o ignore ping -c1 2001:db8::2 + atf_check_equal "1" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" + + # m_len=39 + # provided IPv6 basic header is 40 bytes long, it should impact the dst addr + jexec alcatraz sysctl net.dummymbuf.rules="inet6 in ${epair}b pull-head 39;" + jexec alcatraz sysctl net.dummymbuf.hits=0 + atf_check -s exit:0 -o ignore ping -c1 2001:db8::2 + atf_check_equal "1" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" +} +inet6_in_mbuf_len_cleanup() +{ + pft_cleanup +} + atf_init_test_cases() { atf_add_test_case "inet_in_mbuf_len" + atf_add_test_case "inet6_in_mbuf_len" } From d06fe346eccf0919a29d43599548e49c0d6a7a17 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 19 Aug 2024 16:02:26 +0000 Subject: [PATCH 033/145] libgeom: Avoid fixed remappings of the devstat device libgeom maintains a quasi-private mapping of /dev/devstat, which might grow over time if new devices appear. When the mapping needs to be expanded, the old mapping is passed as a hint, but this appears to be unnecessary. Simplify and improve things a bit: - stop passing a hint when remapping, - don't create a mapping in geom_stats_open(), as geom_stats_resync() will create it for us, - check for errors from munmap().
Reviewed by: imp, asomers Tested by: asomers MFC after: 2 weeks Sponsored by: Innovate UK Differential Revision: https://reviews.freebsd.org/D46294 --- lib/libgeom/geom_stats.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/lib/libgeom/geom_stats.c b/lib/libgeom/geom_stats.c index 7ae5c947b7b1..510636eb9a8b 100644 --- a/lib/libgeom/geom_stats.c +++ b/lib/libgeom/geom_stats.c @@ -54,9 +54,12 @@ geom_stats_close(void) { if (statsfd == -1) return; - munmap(statp, npages * pagesize); - statp = NULL; - close (statsfd); + if (statp != NULL) { + if (munmap(statp, npages * pagesize) != 0) + err(1, "munmap"); + statp = NULL; + } + close(statsfd); statsfd = -1; } @@ -73,22 +76,18 @@ geom_stats_resync(void) if (error) err(1, "DIOCGMEDIASIZE(" _PATH_DEV DEVSTAT_DEVICE_NAME ")"); - munmap(statp, npages * pagesize); - p = mmap(statp, mediasize, PROT_READ, MAP_SHARED, statsfd, 0); + if (statp != NULL && munmap(statp, npages * pagesize) != 0) + err(1, "munmap"); + p = mmap(NULL, mediasize, PROT_READ, MAP_SHARED, statsfd, 0); if (p == MAP_FAILED) - err(1, "mmap(/dev/devstat):"); - else { - statp = p; - npages = mediasize / pagesize; - } + err(1, "mmap(/dev/devstat)"); + statp = p; + npages = mediasize / pagesize; } int geom_stats_open(void) { - int error; - void *p; - if (statsfd != -1) return (EBUSY); statsfd = open(_PATH_DEV DEVSTAT_DEVICE_NAME, O_RDONLY); @@ -96,15 +95,6 @@ geom_stats_open(void) return (errno); pagesize = getpagesize(); spp = pagesize / sizeof(struct devstat); - p = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, statsfd, 0); - if (p == MAP_FAILED) { - error = errno; - close(statsfd); - statsfd = -1; - errno = error; - return (error); - } - statp = p; npages = 1; geom_stats_resync(); return (0); From b9d1249b5b686dad0346e7d211693456c64049ec Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Mon, 19 Aug 2024 18:10:55 +0200 Subject: [PATCH 034/145] pci_vendors: update to 2024.06.23 --- share/misc/pci_vendors 
| 238 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 215 insertions(+), 23 deletions(-) diff --git a/share/misc/pci_vendors b/share/misc/pci_vendors index df152a4e97e0..f168678909c8 100644 --- a/share/misc/pci_vendors +++ b/share/misc/pci_vendors @@ -1,8 +1,8 @@ # # List of PCI ID's # -# Version: 2024.05.14 -# Date: 2024-05-14 03:15:02 +# Version: 2024.06.23 +# Date: 2024-06-23 03:15:02 # # Maintained by Albert Pool, Martin Mares, and other volunteers from # the PCI ID Project at https://pci-ids.ucw.cz/. @@ -104,6 +104,17 @@ 025e d81d NVMe DC SSD E1.L 9.5mm [D5-P5336] 0b70 NVMe DC SSD [Yorktown controller] 2b59 NVMe DC SSD [Atomos Prime] + 025e 0008 NVMe DC SSD U.2-SFF 15mm [D7-PS1010] + 025e 0019 NVMe DC SSD E3.S-1T 7.5mm [D7-PS1010] + 025e 0108 NVMe DC SSD U.2-SFF 15mm [D7-PS1030] + 025e 0119 NVMe DC SSD E3.S-1T 7.5mm [D7-PS1030] + 108e 48a0 NVMe DC SSD U.2-SFF 15mm 3.84TB [D7-PS1010 Custom] + 108e 48a1 NVMe DC SSD U.2-SFF 15mm 7.68TB [D7-PS1010 Custom] + 108e 48a2 NVMe DC SSD U.2-SFF 15mm 15.36TB [D7-PS1010 Custom] + 108e 48a3 NVMe DC SSD Add-In-Card [D7-PS1030 Custom] + 108e 48a4 NVMe DC SSD E3.S-1T 7.5mm 3.84TB [D7-PS1010 Custom] + 108e 48a5 NVMe DC SSD E3.S-1T 7.5mm 7.68TB [D7-PS1010 Custom] + 108e 48a6 NVMe DC SSD E3.S-1T 7.5mm 15.36TB [D7-PS1010 Custom] f1ab P41 Plus NVMe SSD (DRAM-less) [Echo Harbor] f1ac P44 Pro NVMe SSD [Hollywood Beach] 0270 Hauppauge computer works Inc. 
(Wrong ID) @@ -616,6 +627,7 @@ 1bd4 000e 6G SAS2008IR 1bd4 000f 6G SAS2008IT SA5248 1bd4 0010 6G SAS2008IR SA5248 + 4c52 96c8 LRSA96C8 8-Port SATA3(6Gb/s)Exchange Adapter (with Raid) 8086 350f RMS2LL040 RAID Controller 8086 3700 SSD 910 Series 0073 MegaRAID SAS 2008 [Falcon] @@ -1082,6 +1094,12 @@ 10e4 MegaRAID 12GSAS/PCIe Unsupported SAS38xx 10e5 MegaRAID 12GSAS/PCIe SAS38xx 10e6 MegaRAID 12GSAS/PCIe Secure SAS38xx + 1000 04d9 3808N iMR ROMB + 1000 04da 3808N iMR ROMB + 1000 04db 3808N iMR ROMB + 1000 04dc 3808N iMR ROMB + 1000 04dd 3808N iMR ROMB + 1000 40d8 MegaRAID 9524-8i 1000 40e0 MegaRAID 9540-2M2 1028 2172 PERC H355 Adapter 1028 2173 PERC H355 Front @@ -1127,6 +1145,7 @@ 1000 a064 PEX88064 64 lane/port PCIe Gen 4 Switch 1000 a080 PEX88080 80 lane/port PCIe Gen 4 Switch 1000 a096 PEX88096 98 lane/port PCIe Gen 4.0 Switch + 4c52 9f48 LRNV9F48 4-port Built-in 8654 NVMe Switching Adapter c012 PEX880xx PCIe Gen 4 Switch # Virtual endpoint used in Broadcom synthetic PCIe switches for resource reservation 1000 100b PEX88000 PCIe Gen 4 Virtual Upstream/Downstream Port @@ -1249,7 +1268,7 @@ 103c 8b17 ProBook 445 G9/455 G9 [Ryzen 7 Integrated Radeon GPU] 15ff Fenghuang [Zhongshan Subor Z+] 1607 Arden - 1636 Renoir [Radeon RX Vega 6 (Ryzen 4000/5000 Mobile Series)] + 1636 Renoir [Radeon Vega Series / Radeon Vega Mobile Series] 1637 Renoir Radeon High Definition Audio Controller 1638 Cezanne [Radeon Vega Series / Radeon Vega Mobile Series] 1043 16c2 Radeon Vega 8 @@ -3987,6 +4006,8 @@ 1458 2408 Radeon RX 6750 XT GAMING OC 12G 1462 3980 Radeon RX 6700 XT Mech 2X 12G [MSI] 148c 2409 Red Devil RX 6700 XT +# Dual fan version + 1849 5210 Radeon RX 6700 XT Challenger D 1849 5219 Radeon RX 6700 XT Challenger D 1849 5222 RX 6700 XT Challenger D OC # Gaming 1440/QHD Overclock edition with 12 Gb GDDR6 and PCIe 4.0 of Radeon RX 6700 XT by Sapphire PULSE manufactured on autumn 2022 / C1 reviseion @@ -4022,13 +4043,17 @@ 7448 Navi 31 [Radeon Pro W7900] 744c Navi 31 [Radeon RX 7900 
XT/7900 XTX/7900M] 1002 0e3b RX 7900 GRE [XFX] + 1043 0506 TUF Gaming Radeon RX 7900 XTX OC + 1849 5304 Radeon RX 7900 XTX 1da2 471e PULSE RX 7900 XTX + 1da2 475e PULSE RX 7900 GRE 1da2 e471 NITRO+ RX 7900 XTX Vapor-X 1eae 7901 RX-79XMERCB9 [SPEEDSTER MERC 310 RX 7900 XTX] 745e Navi 31 [Radeon Pro W7800] + 7460 7460 Navi32 GL-XL [AMD Radeon PRO V710] 7470 Navi 32 [Radeon PRO W7700] 747e Navi 32 [Radeon RX 7700 XT / 7800 XT] - 7480 Navi 33 [Radeon RX 7700S/7600/7600S/7600M XT/PRO W7600] + 7480 Navi 33 [Radeon RX 7600/7600 XT/7600M XT/7600S/7700S / PRO W7600] 1849 5313 RX 7600 Challenger OC 7483 Navi 33 [Radeon RX 7600M/7600M XT] 7489 Navi 33 [Radeon Pro W7500] @@ -5460,10 +5485,12 @@ 1849 43c8 Fatal1ty X370 Professional Gaming 43b6 X399 Series Chipset SATA Controller 43b7 300 Series Chipset SATA Controller + 43b8 A320 Chipset SATA Controller [AHCI mode] 43b9 X370 Series Chipset USB 3.1 xHCI Controller 1849 43d0 Fatal1ty X370 Professional Gaming 43ba X399 Series Chipset USB 3.1 xHCI Controller 43bb 300 Series Chipset USB 3.1 xHCI Controller + 43bc A320 USB 3.1 XHCI Host Controller 43c6 400 Series Chipset PCIe Bridge 43c7 400 Series Chipset PCIe Port 43c8 400 Series Chipset SATA Controller @@ -9186,12 +9213,17 @@ 8717 PEX 8717 16-lane, 8-Port PCI Express Gen 3 (8.0 GT/s) Switch with DMA 8718 PEX 8718 16-Lane, 5-Port PCI Express Gen 3 (8.0 GT/s) Switch 8724 PEX 8724 24-Lane, 6-Port PCI Express Gen 3 (8 GT/s) Switch, 19 x 19mm FCBGA + 4c52 9234 LRNV9324 2-port Built-in 8643 NVMe Exchange Adapter + 4c52 9524 LRNV9524 2-port M.2 NVMe SSD Exchange Adapter 8725 PEX 8725 24-Lane, 10-Port PCI Express Gen 3 (8.0 GT/s) Multi-Root Switch with DMA 8732 PEX 8732 32-lane, 8-Port PCI Express Gen 3 (8.0 GT/s) Switch 8734 PEX 8734 32-lane, 8-Port PCI Express Gen 3 (8.0GT/s) Switch 8747 PEX 8747 48-Lane, 5-Port PCI Express Gen 3 (8.0 GT/s) Switch + 4c52 9347 LRNV9347L 2-port Built-in 8643 NVMe Switching Adapter + 4c52 9547 LRNV9547 4-port M.2 NVMe SSD Exchange Adapter 8748 PEX 8748 
48-Lane, 12-Port PCI Express Gen 3 (8 GT/s) Switch, 27 x 27mm FCBGA 8749 PEX 8749 48-Lane, 18-Port PCI Express Gen 3 (8.0 GT/s) Multi-Root Switch with DMA + 4c52 9349 LRNV9349 8-port SFF-8643 NVMe SSD Exchange Adapter 87a0 PEX PCI Express Switch NT0 Port Link Interface 87a1 PEX PCI Express Switch NT1 Port Link Interface 87b0 PEX PCI Express Switch NT0 Port Virtual Interface @@ -12959,6 +12991,8 @@ 2296 Tegra PCIe Endpoint Virtual Network 22a3 GH100 [H100 NVSwitch] 22ba AD102 High Definition Audio Controller + 22bc AD104 High Definition Audio Controller + 22bd AD106M High Definition Audio Controller 2302 GH100 2313 GH100 [H100 CNX] 2321 GH100 [H100L 94GB] @@ -13080,6 +13114,7 @@ 2681 AD102 [RTX TITAN Ada] 2684 AD102 [GeForce RTX 4090] 2685 AD102 [GeForce RTX 4090 D] + 2689 AD102 [GeForce RTX 4070 Ti SUPER] 26b1 AD102GL [RTX 6000 Ada Generation] 26b2 AD102GL [RTX 5000 Ada Generation] 26b3 AD102GL [RTX 5880 Ada Generation] @@ -13299,6 +13334,7 @@ 8043 LANai4.x [Myrinet LANai interface chip] 8062 S5933_PARASTATION 807d S5933 [Matchmaker] + 8081 GPIB interface card [IOtech Inc. PCI488] 8088 Kongsberg Spacetec Format Synchronizer 8089 Kongsberg Spacetec Serial Output Board 809c S5933_HEPC3 @@ -16238,7 +16274,9 @@ 11ae Aztech System Ltd 11af Avid Technology Inc. 0001 Cinema + ee21 Digidesign DSP Farm ee40 Digidesign Audiomedia III + ee60 Digidesign SampleCell II / II Plus 11b0 V3 Semiconductor Inc. 
0002 V300PSC 0292 V292PBC [Am29030/40 Bridge] @@ -17445,7 +17483,8 @@ 10a9 8002 Acenic Gigabit Ethernet 12ae 0002 Gigabit Ethernet-T (3C986-T) 00fa Farallon PN9100-T Gigabit Ethernet -12af TDK USA Corp +12af TDK Corporation + 5831 GBDriver GX1 x2 NVMe SSD Controller (DRAM-less) 12b0 Jorge Scientific Corp 12b1 GammaLink 12b2 General Signal Networks @@ -18916,6 +18955,9 @@ 580b Secure Flash Controller (Xenon) 580d System Management Controller (Xenon) 5811 Xenos GPU (Xenon) + 5821 Xenos GPU (Zephyr/Falcon) + 5831 Xenos GPU (Jasper) + 5841 Xenos GPU (Slim) 1415 Oxford Semiconductor Ltd 8401 OX9162 Mode 1 (8-bit bus) 8403 OX9162 Mode 0 (parallel port) @@ -19859,6 +19901,7 @@ 144d a801 SM963 2.5" NVMe PCIe SSD a806 NVMe SSD SM0032L a808 NVMe SSD Controller SM981/PM981/PM983 +# Used by different variants of SSD 970 EVO and PRO 144d a801 SSD 970 EVO/PRO 1d49 403b Thinksystem U.2 PM983 NVMe SSD a809 NVMe SSD Controller 980 (DRAM-less) @@ -19886,7 +19929,7 @@ # Actually 88SS1322 according to techpowerup a80b NVMe SSD Controller PM9B1 (DRAM-less) a80c NVMe SSD Controller S4LV008[Pascal] - a80d NVMe SSD Controller PM9C1a + a80d NVMe SSD Controller PM9C1a (DRAM-less) a820 NVMe SSD Controller 171X 1028 1f95 Express Flash NVMe XS1715 SSD 400GB 1028 1f96 Express Flash NVMe XS1715 SSD 800GB @@ -20844,6 +20887,7 @@ 14e4 5250 NetXtreme-E BCM57504 4x25G KR Mezz 14e4 5425 NetXtreme-E Quad-port 25G SFP28 Ethernet OCP 3.0 Adapter (BCM957504-N425G) 14e4 d142 NetXtreme-E P425D BCM57504 4x25G SFP28 PCIE + 1590 0420 HPE Ethernet 25/50Gb 2-port 6310C Adapter 1752 BCM57502 NetXtreme-E 10Gb/25Gb/40Gb/50Gb Ethernet 1760 BCM57608 10Gb/25Gb/50Gb/100Gb/200Gb/400Gb Ethernet 14e4 d125 BCM57608 2x200G PCIe Ethernet NIC @@ -21908,11 +21952,13 @@ 0262 MT27710 [ConnectX-4 Lx Programmable] EN 0263 MT27710 [ConnectX-4 Lx Programmable Virtual Function] EN 0264 Innova-2 Flex Burn image - 0270 Spectrum-4L, Flash recovery mode + 0270 Spectrum-5 in Flash Recovery Mode 0271 Spectrum-4L, RMA - 0274 Spectrum-4C, 
Flash recovery mode + 0274 Spectrum-6 in Flash Recovery Mode 0275 Spectrum-4C RMA 0277 Spectrum-4TOR RMA + 0278 Quantum-4 in Flash Recovery Mode + 0279 Quantum-4 RMA 0281 NPS-600 Flash Recovery 0282 ArcusE Flash recovery 0283 ArcusE RMA @@ -22132,6 +22178,7 @@ d2f2 Quantum-2 NDR (400Gbps) switch d2f4 Quantum-3 d2f6 Quantum-3CPO + d2f8 Quantum-4 15b4 CCI/TRIAD 15b5 Cimetrics Inc 15b6 Texas Memory Systems Inc @@ -23338,7 +23385,8 @@ 1108 IPQ95xx/97xx PCIe Root Port 1109 QCN62xx/92xx Wireless Network Adapter 17cc NetChip Technology, Inc - 2280 USB 2.0 + 2280 NET2280 PCI to USB 2.0 Hi-Speed Peripheral Controller + 2282 NET2282 PCI to USB 2.0 Hi-Speed Peripheral Controller 17cd Cadence Design Systems, Inc. 17cf Z-Com, Inc. 17d3 Areca Technology Corp. @@ -23925,6 +23973,7 @@ 0013 SH7757 PCIe Switch [PS] 0014 uPD720201 USB 3.0 Host Controller 0015 uPD720202 USB 3.0 Host Controller + 4c52 9a72 LRSU9A72 2-Port USB 3.0 Exchange Adapter 001a SH7758 PCIe-PCI Bridge [PPB] 001b SH7758 PCIe End-Point [PBI] 001d SH7758 PCIe Switch [PS] @@ -24087,6 +24136,8 @@ 1942 ClearSpeed Technology plc e511 Advance X620 accelerator card e521 Advance e620 accelerator card +1945 MERA + 6200 PXI/PXIe measurement module 1947 C-guys, Inc. 4743 CG200 Dual SD/SDIO Host controller device 1948 Alpha Networks Inc. 
@@ -24184,6 +24235,7 @@ 7010 MPC8641 PCI Host Bridge 7011 MPC8641D PCI Host Bridge 7018 MPC8610 + 81c0 LS1046A PCI Express Bridge c006 MPC8308 1a56 1201 Bigfoot Killer E2100 Gigabit Ethernet Controller # PCIe interface for emulator @@ -24773,6 +24825,7 @@ 1050 Virtio 1.0 GPU 1052 Virtio 1.0 input 1053 Virtio 1.0 socket + 1058 virtio-mem 105a Virtio file system 1110 Inter-VM shared memory 1af4 1100 QEMU Virtual Machine @@ -24797,6 +24850,7 @@ 0612 ASM1061/ASM1062 Serial ATA Controller 1849 0612 Motherboard 0622 ASM106x Serial ATA AHCI Controller + 4c52 9661 LRST9661 2-port M.2 SATA3(6Gb/s) Raid Adapter 0624 ASM106x SATA/RAID Controller 0625 106x SATA/RAID Controller 1040 ASM1040 SuperSpeed USB Host Controller @@ -24818,6 +24872,7 @@ 1187 ASM1187e 7-Port PCIe x1 Gen2 Packet Switch 118f ASM1187e 7-Port PCIe x1 Gen2 Packet Switch 1242 ASM1142 USB 3.1 Host Controller + 4c52 9a42 LRSU9A42 2-Port Type-A Exchange Adapter 1343 ASM1143 USB 3.1 Host Controller 1806 ASM1806 4-Port PCIe x2 Gen2 Packet Switch 1812 ASM1812 6-Port PCIe x4 Gen2 Packet Switch @@ -24907,10 +24962,13 @@ 1028 2113 BOSS-N1 Modular 1028 2151 BOSS-N1 Modular ET 1028 2196 ROR-N1 + 1028 2286 BOSS-N1 DC-MHS + 1028 2287 BOSS-N1 Modular 1b4b 2241 Santa Cruz NVMe Host Adapter 1b96 4000 WD_BLACK AN1500 NVMe SSD 1d49 0306 ThinkSystem M.2 NVMe 2-Bay RAID Enablement Kit 1d49 0307 ThinkSystem 7mm NVMe 2-Bay Rear RAID Enablement Kit + 4c52 9541 LRNV9541 2-port M.2 NVMe Raid Adapter 2b42 88W8997 2.4/5 GHz Dual-Band 2x2 Wi-Fi® 5 (802.11ac) + Bluetooth® 5.3 Solution 2b43 NXP 88W9098 Wi-Fi 6 (ax) MAC #1 2b44 NXP 88W9098 Wi-Fi 6 (ax) MAC #2 @@ -24919,6 +24977,7 @@ 9123 88SE9123 PCIe SATA 6.0 Gb/s controller dc93 600e DC-6xxe series SATA 6G controller 9125 88SE9125 PCIe SATA 6.0 Gb/s controller + 4c52 9615 LRST9615 4-port SATA3(6Gb/s) Exchange Adapter 9128 88SE9128 PCIe SATA 6 Gb/s RAID controller 9130 88SE9128 PCIe SATA 6 Gb/s RAID controller with HyperDuo 1043 8438 P8P67 Deluxe Motherboard @@ -24947,6 +25006,7 @@ 1d49 
0303 ThinkSystem SE350 M.2 SATA 4-Bay Data RAID Mirroring Enablement Kit 1d49 0304 ThinkSystem M.2 SATA 2-Bay RAID Enablement Kit 1d49 0305 ThinkSystem 7mm SATA 2-Bay Rear RAID Enablement Kit + 4c52 9630 LRST9630 4-port SATA3(6Gb/s) Raid Adapter 9235 88SE9235 PCIe 2.0 x2 4-port SATA 6 Gb/s Controller 9445 88SE9445 PCIe 2.0 x4 4-Port SAS/SATA 6 Gbps RAID Controller 9480 88SE9480 SAS/SATA 6Gb/s RAID controller @@ -24960,6 +25020,21 @@ # 2xHDMI and 2xHD-SDI inputs e5f4 MPEG2 and H264 Encoder-Transcoder f1c4 Dual ASI-RX/TX-CI card +1b5e STAR-Dundee Ltd. + 0001 SpaceWire PCI Mk2 + 0002 SpaceWire PCIe Mk1 + 0003 SpaceWire cPCI Mk2 + 0004 SpaceWire PXI Recorder Mk1 + 0005 SpaceWire PXI Interface Mk1 + 0006 SpaceWire PXI Interface Mk1 with RMAP Target + 0008 SpaceWire PXI Router Mk1 + 000b SpaceWire PXI Interface Mk2 + 000c SpaceWire PXI Interface Mk2 with RMAP Target + 000d SpaceWire PXI Router Mk2 + 000e SpaceWire PXI Recorder Mk2 + 0100 STAR-Ultra PCIe + 0102 STAR-Ultra Single-Lane Router + 0200 SpaceWire PCIe Mk2 1b61 Byd Precision Manufacture Co.,Ltd 1b66 DELTACAST 0007 DELTA-3G-elp-d @@ -25121,11 +25196,14 @@ 1bb1 0179 Nytro 5360S - E3.S # Nytro 5360S (Rocinante Single Port) TCG - E3.S 1bb1 0180 Nytro 5360S TCG - E3.S +# Nytro 5060H (Rocinante High Performance) non-SED + 1bb1 0181 Nytro 5060H 1bb1 01a1 Nytro XP7102 5012 FireCuda/IronWolf 510 SSD 5013 BarraCuda Q5 NVMe SSD (DRAM-less) 5016 FireCuda 520/IronWolf 525 SSD 5018 FireCuda 530 SSD + 5019 BarraCuda PCIe SSD (DRAM-less) # 2TB 5021 FireCuda 520 SSD # 1TB @@ -25315,6 +25393,7 @@ 0023 Ultrastar SN200 Series NVMe SSD 1c58 8823 Ultrastar Memory (ME200) 1c5c SK hynix + 1069 PCB01 NVMe Solid State Drive 1282 PC300 NVMe Solid State Drive 128GB 1283 PC300 NVMe Solid State Drive 256GB 1284 PC300 NVMe Solid State Drive 512GB @@ -25525,6 +25604,7 @@ 5762 FALCON, GAMMIX S41, SPECTRIX S40G NVMe SSD (DRAM-less) 5763 XPG GAMMIX S5 NVMe SSD (DRAM-less) 5766 XPG GAMMIXS1 1L, XPG GAMMIX S5, LEGEND 710 / 740, SWORDFISH NVMe SSD 
(DRAM-less) + 5772 LEGEND 850 LITE NVMe SSD (DRAM-less) 612a LEGEND 750 NVMe SSD (DRAM-less) 613a ATOM 50, LEGEND 840 NVMe SSD (DRAM-less) 621a LEGEND 850 NVMe SSD (DRAM-less) @@ -25565,6 +25645,7 @@ 6304 AM630 PCIe 4.0 NVMe SSD 1024GB 6a02 AM6A0 PCIe 4.0 NVMe SSD 256GB 6a03 RPETJ512MKP1QDQ PCIe 4.0 NVMe SSD 512GB (DRAM-less) + 6a13 RPJYJ512MKN1QWQ PCIe 4.0 NVMe SSD 512GB (DRAM-less) 6a14 RPEYJ1T24MKN2QWY PCIe 4.0 NVMe SSD 1024GB (DRAM-less) 8030 NVMe SSD Controller UH8X2X/UH7X2X series 1cc4 1122 NVMe SSD UH812a U.2 1.92TB @@ -25662,7 +25743,7 @@ 071a KX-5000/KX-6000/KX-6000G/KH-40000 PCI Express Root Port 071b KX-5000/KX-6000/KX-6000G/KH-40000/KX-7000 PCI Express Root Port 071c KX-5000/KX-6000/KX-6000G/KH-40000/KX-7000 PCI Express Root Port - 071d KX-5000/KX-6000/KX-6000G/KH-40000 PCI Express Root Port + 071d KX-5000/KX-6000/KX-6000G/KH-40000/KX-7000 PCI Express Root Port 071e KX-5000/KX-6000/KX-6000G/KH-40000/KX-7000 PCI Express Root Port 071f ZX-200 Upstream Port of PCI Express Switch 0720 ZX-200 PCIE RC6 controller @@ -25851,6 +25932,9 @@ 1024 AR-TK242 [2x10GbE Packet Capture Device] 1025 AR-TK242-FX2 [2x100GbE Gen5 Packet Capture Device] 1026 AR-TK242-FX2 [1x200GbE Gen5 Packet Capture Device] + 1027 AR-P2P-DBG [P2P Debug Function] + 1028 AR-P2P-ATR [P2P Actor Function] + 1029 AR-P2P-UTL [P2P Utility Function] 4200 A5PL-E1-10GETI [10 GbE Ethernet Traffic Instrument] 1d72 Xiaomi 1d78 DERA Storage @@ -25979,6 +26063,7 @@ 1062 Lexar NM710 NVME SSD 1160 FORESEE P900 BGA NVMe SSD (DRAM-less) 1202 Lexar NM610 PRO NVME SSD (DRAM-less) + 12e4 ORCA 4836 Series eSSD 1602 Lexar NM790 NVME SSD (DRAM-less) 1d97 Lexar NM620 NVME SSD (DRAM-less) 2263 SM2263EN/SM2263XT-based OEM NVME SSD (DRAM-less) @@ -26058,6 +26143,9 @@ 1dbe 2006 Dongting-N2 DC SSD U.2 7680GB 1dbe 3001 Donghu-Z2 DC ZNS SSD U.2 4000GB 1dbe 3002 Donghu-Z2 DC ZNS SSD U.2 8000GB + 5666 NVMe SSD Controller IG5666 + 5668 NVMe SSD Controller IG5668 + 5669 NVMe SSD Controller IG5669 [Tacoma] 1dbf Guizhou Huaxintong 
Semiconductor Technology Co., Ltd 0401 StarDragon4800 PCI Express Root Port 1dc2 Alco Digital Devices Limited @@ -26410,6 +26498,7 @@ 1df8 d100 M.2 NVMe SSD 1df8 d201 M.2 NVMe SSD 1df8 d600 M.2 NVMe SSD +1dfa Astera Labs, Inc. 1dfc JSC NT-COM 1181 TDM 8 Port E1/T1/J1 Adapter 1e0d SambaNova Systems, Inc @@ -26595,10 +26684,12 @@ 1e3b 0069 Enterprise NVMe SSD U.2 3.20TB (R5301D) 1e3b 006c Enterprise NVMe SSD U.2 1.92TB (R5101) 1e3b 006d Enterprise NVMe SSD U.2 1.60TB (J5301) - 1e3b 00b9 Enterprise NVMe SSD U.2 QDP 25.60TB (R5300) - 1e3b 00be Enterprise NVMe SSD U.2 QDP 30.72TB (R5100) - 1e3b 00c1 Enterprise NVMe SSD U.2 QDP 25.60TB (R5300D) - 1e3b 00c4 Enterprise NVMe SSD U.2 QDP 30.72TB (R5100D) + 1e3b 00b9 Enterprise NVMe SSD U.2 ODP 25.60TB (R5301)/(J5301) + 1e3b 00be Enterprise NVMe SSD U.2 ODP 30.72TB (R5101)/(J5101) + 1e3b 00c1 Enterprise NVMe SSD U.2 ODP 25.60TB (R5301D)/(J5301D) + 1e3b 00c4 Enterprise NVMe SSD U.2 ODP 30.72TB (R5101D)/(J5101D) + 1e3b 00c7 Enterprise NVMe SSD U.2 ODP 25.60TB (J5300) + 1e3b 00c8 Enterprise NVMe SSD U.2 ODP 30.72TB (J5100) 1e3b 00c9 Enterprise NVMe SSD U.2 ODP 15.36TB (J5001) 1e3b 00ca Enterprise NVMe SSD U.2 ODP 3.84TB (J5102) 1e3b 00cb Enterprise NVMe SSD U.2 ODP 7.68TB (J5102) @@ -26609,12 +26700,16 @@ 1e3b 00dc Enterprise NVMe SSD U.2 ODP 30.72TB with SAMSUNG 32GB DRAM (J5001) 1e3b 00dd Enterprise NVMe SSD U.2 ODP 30.72TB with MT 32GB DRAM(J5001) 1e3b 00de Enterprise NVMe SSD U.2 ODP 15.36TB with SK 16GB DRAM(J5001D) - 1e3b 00df Enterprise NVMe SSD U.2 ODP 30.72TB with SAMSUNG 32GB DRAM(J5001D) + 1e3b 00df Enterprise NVMe SSD U.2 ODP 30.72TB with SAMSUNG 32GB DRAM(J5001) 1e3b 00e7 Enterprise NVMe SSD U.2 ODP 30.72TB with MT 32GB DRAM(J5001D) 1e3b 00e8 Enterprise NVMe SSD U.2 QDP 3.20TB (J5301) 1e3b 00e9 Enterprise NVMe SSD U.2 ODP 6.40TB (J5301) 1e3b 00ea Enterprise NVMe SSD U.2 QDP 3.20TB (J5301D) 1e3b 00eb Enterprise NVMe SSD U.2 ODP 6.40TB (J5301D) + 1e3b 00ec Enterprise NVMe SSD U.2 ODP 30.72TB with MT 32GB DRAM(J5101) + 
1e3b 00ed Enterprise NVMe SSD U.2 ODP 30.72TB with MT 32GB DRAM(R5101) + 1e3b 00ee Enterprise NVMe SSD U.2 ODP 15.36B with SK 16GB DRAM(J5101) + 1e3b 00ef Enterprise NVMe SSD U.2 ODP 12.80TB with SK 16GB DRAM(J5301) 1e3b 00f0 Enterprise NVMe SSD U.2 0.40TB (X2900) 1e3b 00f1 Enterprise NVMe SSD U.2 0.80TB (X2900) 1e3b 00f2 Enterprise NVMe SSD U.2 1.60TB (X2900) @@ -26786,6 +26881,7 @@ 1001 Video Accelerator 1eb4 Quantum Nebula Microelectronics Technology Co.,Ltd. 3401 SSD Contoller +1eb6 Wuxi Stars Microsystem Technology Co., Ltd 1eb9 Senscomm Semiconductor, Inc 2020 SCM2625 Wi-Fi6 Network Adapter 1ebd EMERGETECH Company Ltd. @@ -26829,12 +26925,14 @@ 1eca Lightmatter 0000 Envise-B 1ed0 Hosin Global Electronics + 2283 Patriot P300 NVMe SSD (DRAM-less) 1ed2 FuriosaAI, Inc. 0000 Warboy 1111 RNGD 0000 1111 RNGD-S 0000 2222 RNGD VF 0000 3333 RNGD-S VF + 2222 RNGD-S 1ed3 Yeston 1ed5 Moore Threads Technology Co.,Ltd 0100 MTT S10 @@ -26918,6 +27016,7 @@ 1ef6 GrAI Matter Labs 1ef7 Shenzhen Gunnir Technology Development Co., Ltd 1efb Flexxon Pte Ltd +1eff Rebellions Inc. 
1f02 Beijing Dayu Technology 1f03 Shenzhen Shichuangyi Electronics Co., Ltd 1202 MAP1202-Based NVMe SSD (DRAM-less) @@ -26947,7 +27046,27 @@ 1a01 M16104 Family Virtual Function 1f0f 0001 M16104 Family Virtual Function 2022 D1055AS PCI Express Switch Upstream Port + 3403 M18110 Family + 3404 M18110 Lx Family + 3405 M18110 Family BASE-T + 3406 M18110 Lx Family BASE-T + 3407 M18110 Family OCP + 3408 M18110 Lx Family OCP + 3409 M18110 Family BASE-T OCP + 340a M18110 Lx Family BASE-T OCP + 340b M18120 Family + 340c M18120 Lx Family + 340d M18120 Family BASE-T + 340e M18120 Lx Family BASE-T + 340f M18120 Family OCP + 3410 M18120 Lx Family OCP + 3411 M18120 Family BASE-T OCP + 3412 M18120 Lx Family BASE-T OCP + 3413 M18100 Family Virtual Function 9088 D1055AS PCI Express Switch Downstream Port +1f16 XConn Technologies +# XConn XC50256 CXL2.0/PCIe5.0 switch + c500 XC50256 1f17 Zettastone Technology 1f24 xFusion Digital Technologies Co., Ltd. 1058 EP500/EP600 NVMe SSD @@ -27148,6 +27267,25 @@ 1fe4 0077 Enterprise NVMe SSD U.2 6.40TB(HP630) 1fe4 0078 Enterprise NVMe SSD U.2 3.20TB(HP630) 1fe9 MemryX +# LinkData Technology (Tianjin) Co., LTD +1ff2 Linkdata + 10a1 NIC1160 Ethernet Controller Family + 1ff2 0c11 10GE Ethernet Adapter 1160-2X + 10a2 NIC1160 Ethernet Controller Virtual Function Family + 20a1 IOC2110 Storage Controller + 1ff2 0a11 2120-16i SATA3/SAS3 HBA Adapter + 1ff2 0a12 2120-8i SATA3/SAS3 HBA Adapter + 20a2 IOC2250 Storage Controller + 1ff2 0a21 2230-18i Tri-mode HBA Adapter + 1ff2 0a22 2230-10i Tri-mode HBA Adapter + 1ff2 0a23 2230-16i Tri-mode HBA Adapter + 1ff2 0a24 2230-8i Tri-mode HBA Adapter + 1ff2 0a28 2233-16i Tri-mode HBA Adapter + 30a2 ROC3250 Storage Controller + 1ff2 0b21 3260-18i Tri-mode RAID Adapter + 1ff2 0b22 3260-10i Tri-mode RAID Adapter + 1ff2 0b23 3260-16i Tri-mode RAID Adapter + 1ff2 0b24 3260-8i Tri-mode RAID Adapter 1ff4 DEEPX Co., Ltd. 
0000 DX_M1 0001 DX_M1A @@ -27600,10 +27738,8 @@ 4c52 LR-LINK 1001 Smart Network Adapter 4c52 a008 LREG1008PT Single-port 1Gb Smart Ethernet Network Adapter - 4c52 a009 LREG1009PT Single-port 2.5Gb Smart Ethernet Network Adapter 1002 Smart Network Adapter 4c52 a006 LREG1006PT Single-port 1.2Gb Network Security Isolation Adapter - 4c52 a007 LREG1007PT Quad-port 10Gb Smart Ethernet Network Adapter 1003 Smart Network Adapter 1004 Smart Network Adapter 4c52 b010 LREG1010PF Single-port 10Gb FPGA Network Security Isolation Adapter @@ -27667,6 +27803,9 @@ 50b2 TerraTec Electronic GmbH 50ce System-on-Chip Engineering S.L. 0001 RELY-MIL-XMC-TSN-SWITCH + 0100 XMC_AV-Dual-ETH + 0101 XMC_AV-ETSN + 0102 XMC_AV-AFDX 5136 S S Technologies 5143 Qualcomm Inc 5145 Ensoniq (Old) @@ -29738,7 +29877,12 @@ 125d Ethernet Controller I226-IT 12d1 Ethernet Controller E830-CC for backplane 12d2 Ethernet Controller E830-CC for QSFP + 8086 0002 Ethernet Network Adapter E830-C-Q2 for OCP 3.0 + 8086 0004 Ethernet Network Adapter E830-CC-Q1 for OCP 3.0 12d3 Ethernet Controller E830-CC for SFP + 8086 0001 Ethernet Network Adapter E830-XXV-2 for OCP 3.0 + 8086 0003 Ethernet Network Adapter E830-XXV-2 + 8086 0004 Ethernet Network Adapter E830-XXV-4 for OCP 3.0 12d4 Ethernet Controller E830-CC for SFP-DD 12d5 Ethernet Controller E830-C for backplane 12d8 Ethernet Controller E830-C for QSFP @@ -30368,6 +30512,7 @@ 15fc Ethernet Connection (13) I219-V 15ff Ethernet Controller X710 for 10GBASE-T 1014 0000 PCIe3 4-port 10GbE Base-T Adapter + 108e 7b1f Quad Port 10GBase-T Adapter - CP 1137 0000 X710TLG GbE RJ45 PCIe NIC 1137 02c1 X710T2LG 2x10 GbE RJ45 PCIe NIC 1137 02c2 X710T4LG 4x10 GbE RJ45 PCIe NIC @@ -34334,6 +34479,7 @@ 37d9 X722 Hyper-V Virtual Function 3882 Ice Lake LPC Controller 38a4 Ice Lake SPI Controller + 38c8 Ice Lake-LP Smart Sound Technology Audio Controller 38e0 Ice Lake Management Engine Interface 3a00 82801JD/DO (ICH10 Family) 4-port SATA IDE Controller 3a02 82801JD/DO (ICH10 Family) 
SATA AHCI Controller @@ -34932,6 +35078,7 @@ 4641 12th Gen Core Processor Host Bridge/DRAM Registers 1028 0b10 Precision 3571 464d 12th Gen Core Processor PCI Express x4 Controller #0 + 464e Alder Lake-N Thunderbolt 4 USB Controller 464f 12th Gen Core Processor Gaussian & Neural Accelerator 1028 0b10 Precision 3571 4650 12th Gen Core Processor Host Bridge @@ -34978,8 +35125,13 @@ 4908 DG1 [Iris Xe Graphics] 4909 DG1 [Iris Xe MAX 100] 4940 4xxx Series QAT - 4942 4xxx Series QAT - 4944 4xxx Series QAT + 4941 4xxx Series QAT Virtual Function + 4942 401xx Series QAT + 4943 401xx Series QAT Virtual Function + 4944 402xx Series QAT + 4945 402xx Series QAT Virtual Function + 4946 420xx Series QAT + 4947 420xx Series QAT Virtual Function 4b00 Elkhart Lake eSPI Controller 4b23 Elkhart Lake SMBus Controller 4b24 Elkhart Lake SPI (Flash) Controller @@ -35107,6 +35259,7 @@ 51b0 Alder Lake PCI Express Root Port #9 51b1 Alder Lake PCI Express x1 Root Port #10 51bb Alder Lake-P PCH PCIe Root Port #4 + 51bd Alder Lake-P PCH PCIe Root Port #6 51bf Alder Lake PCH-P PCI Express Root Port #9 51c5 Alder Lake-P Serial IO I2C Controller #0 51c6 Alder Lake-P Serial IO I2C Controller #1 @@ -35166,7 +35319,15 @@ 8086 0001 EtherExpress PRO/100 Server Ethernet Adapter 530d 80310 (IOP) IO Processor 5481 Alder Lake-N PCH eSPI Controller + 54a3 Alder Lake-N SMBus + 54a4 Alder Lake-N SPI (flash) Controller + 54a8 Alder Lake-N Serial IO UART Host Controller + 54b0 Alder Lake-N PCI Express Root Port #9 + 54b1 Alder Lake-N PCI Express Root Port #10 + 54b2 Alder Lake-N PCI Express Root Port #11 + 54b3 Alder Lake-N PCI Express Root Port #12 54c8 Alder Lake-N PCH High Definition Audio Controller + 54d3 Alder Lake-N SATA AHCI Controller 54e0 Alder Lake-N PCH HECI Controller 54ed Alder Lake-N PCH USB 3.2 xHCI Host Controller 54ef Alder Lake-N PCH Shared SRAM @@ -35205,7 +35366,7 @@ 56bf DG2 [Arc Graphics A580E] 56c0 ATS-M [Data Center GPU Flex 170] 56c1 ATS-M [Data Center GPU Flex 140] - 56c2 ATS-M [Data 
Center GPU Flex 170G] + 56c2 ATS-M [Data Center GPU Flex 170V] 5780 Thunderbolt 80/120G Bridge [Barlow Ridge Host 80G 2023] 5781 Thunderbolt 80/120G NHI [Barlow Ridge Host 80G 2023] 5782 Thunderbolt 80/120G USB Controller [Barlow Ridge Host 80G 2023] @@ -35219,9 +35380,12 @@ 579e Ethernet Connection E825-C for SFP 57a4 Thunderbolt Bridge [Barlow Ridge Hub 40G 2023] 57a5 Thunderbolt USB Controller [Barlow Ridge Hub 40G 2023] + 57ae Ethernet Controller E610 Backplane + 57af Ethernet Controller E610 SFP 57b0 Ethernet Controller E610 10GBASE T 57b1 Ethernet Controller E610 2.5GBASE T 8086 0000 Ethernet Converged Network Adapter E610 + 57b2 Ethernet Controller E610 SGMII 5845 QEMU NVM Express Controller 1af4 1100 QEMU Virtual Machine 5900 Xeon E3-1200 v6/7th Gen Core Processor Host Bridge/DRAM Registers @@ -35600,6 +35764,7 @@ 7a27 Raptor Lake-S PCH Shared SRAM 7a30 Raptor Lake PCI Express Root Port #9 7a38 Raptor Lake PCI Express Root Port #1 + 7a3a Raptor Point-S PCH - PCI Express Root Port 3 7a3b Raptor Lake PCI Express Root Port #4 7a40 Raptor Lake PCI Express Root Port #17 7a44 Raptor Lake PCI Express Root Port #21 @@ -35646,7 +35811,10 @@ 8086 0094 Wi-Fi 6 AX201 160MHz 7afc Alder Lake-S PCH Serial IO I2C Controller #4 7afd Alder Lake-S PCH Serial IO I2C Controller #5 + 7d03 Meteor Lake-P Dynamic Tuning Technology 7d0b Volume Management Device NVMe RAID Controller Intel Corporation + 7d0d Meteor Lake-P Platform Monitoring Technology + 7d19 Meteor Lake IPU 7d1d Meteor Lake NPU 7d40 Meteor Lake-M [Intel Graphics] 7d41 Arrow Lake-U [Intel Graphics] @@ -35668,10 +35836,15 @@ 7e30 Meteor Lake-P Serial IO SPI Controller #1 7e40 Meteor Lake PCH CNVi WiFi 8086 0094 Wi-Fi 6E AX211 160MHz +# Refer from Intel Meteor Lake EDS (doc#640228) under its "Device IDs" section. 
+ 7e45 Meteor Lake-P Integrated Sensor Hub 7e46 Meteor Lake-P Serial IO SPI Controller #2 + 7e4c Meteor Lake-P Gaussian & Neural-Network Accelerator 7e50 Meteor Lake-P Serial IO I2C Controller #4 7e51 Meteor Lake-P Serial IO I2C Controller #5 7e52 Meteor Lake-P Serial IO UART Controller #2 + 7e70 Meteor Lake-P CSME HECI #1 + 7e73 Meteor Lake-P Keyboard and Text (KT) Redirection 7e78 Meteor Lake-P Serial IO I2C Controller #0 7e79 Meteor Lake-P Serial IO I2C Controller #1 7e7a Meteor Lake-P Serial IO I2C Controller #2 @@ -36758,6 +36931,7 @@ a72f Raptor Lake-P Thunderbolt 4 PCI Express Root Port #2 a73e Raptor Lake-P Thunderbolt 4 NHI #0 1028 0c06 Precision 3580 + a740 Raptor Lake-S 8+12 - Host Bridge/DRAM Controller a74d Raptor Lake PCIe 4.0 Graphics Port a74f GNA Scoring Accelerator module 1028 0c06 Precision 3580 @@ -36851,6 +37025,11 @@ d156 Core Processor Semaphore and Scratchpad Registers d157 Core Processor System Control and Status Registers d158 Core Processor Miscellaneous Registers + e202 Battlemage G21 [Intel Graphics] + e20b Battlemage G21 [Intel Graphics] + e20c Battlemage G21 [Intel Graphics] + e20d Battlemage G21 [Intel Graphics] + e212 Battlemage G21 [Intel Graphics] f1a5 SSD 600P Series 8086 390a SSDPEKKW256G7 256GB f1a6 SSD DC P4101/Pro 7600p/760p/E 6100p Series @@ -36921,7 +37100,7 @@ 0119 WX1860-LC Gigabit Ethernet Controller Virtual Function 011a WX1860A1 Gigabit Ethernet Controller Virtual Function 011b WX1860AL1 Gigabit Ethernet Controller Virtual Function - 1000 Ethernet Controller RP1000 Virtual Function for 10GbE SFP+ + 1000 Ethernet Controller SP1000A Virtual Function for 10GbE SFP+ 1001 Ethernet Controller SP1000A for 10GbE SFP+ 1bd4 0084 Ethernet Controller SP1000A for 10GbE SFP+(lldp) 1bd4 0085 Ethernet Controller SP1000A for 10GBASE-T @@ -36931,7 +37110,7 @@ 8088 0000 Ethernet Network Adaptor RP1000 for 10GbE SFP+ 8088 0300 Ethernet Network Adaptor RP1000-A03 for 10GbE SFP+ 8088 0400 Ethernet Network Adaptor RP1000-A04 for 10GbE SFP+ - 
2000 Ethernet Controller RP2000 Virtual Function for 10GbE SFP+ + 2000 Ethernet Controller WX1820AL Virtual Function for 10GbE SFP+ 2001 Ethernet Controller WX1820AL for 10GbE SFP+ 8088 2000 Ethernet Network Adaptor RP2000 for 10GbE SFP+ 8088 2300 Ethernet Network Adaptor RP2000-A03 for 10GbE SFP+ @@ -36943,7 +37122,17 @@ 8384 SigmaTel 8401 TRENDware International Inc. 8510 Sietium Semiconductor Co., Ltd. - 0201 GenBu02 [GB2062-PCIe-C0] + 0201 GenBu02 Series GPU + 8510 0001 GB2062-PUB-LPDDR + 8510 0002 GB2062-PCIe-C0 + 8510 0003 GB2062-PCIe-C41 + 8510 0004 GB2062-PCIe-HIEILP4 + 8510 0005 CQ2040-PCIe-C21 + 8510 0007 GB2062-PCIe-C40 + 8510 0008 CQ2040-MXM-M60 + 8510 0009 GB2062-PCIe-C20 + 8510 000c CQ2040-PUB + 8510 0201 GB2062-PUB-DDR # nee ScaleMP 8686 SAP 1010 vSMP Foundation controller [vSMP CTL] @@ -37316,6 +37505,9 @@ 103c 1101 Smart Array P416ie-m SR G10 105b 1211 HBA 8238-16i 105b 1321 HBA 8242-24i + 1137 02f8 24G TriMode M1 RAID 4GB FBWC 32D + 1137 02f9 24G TriMode M1 RAID 4GB FBWC 16D + 1137 02fa 24G TriMode M1 HBA 16D 13fe 8312 SKY-9200 MIC-8312BridgeB 152d 8a22 QS-8204-8i 152d 8a23 QS-8238-16i From 3cd90cb66d96b7604b8c5fde6f061e68eddbfc82 Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Mon, 19 Aug 2024 18:11:41 +0200 Subject: [PATCH 035/145] usb_vendors: update to 2024.07.04 --- share/misc/usb_vendors | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/share/misc/usb_vendors b/share/misc/usb_vendors index a1e3ea4acd1d..41b367d1a0d6 100644 --- a/share/misc/usb_vendors +++ b/share/misc/usb_vendors @@ -9,8 +9,8 @@ # The latest version can be obtained from # http://www.linux-usb.org/usb.ids # -# Version: 2024.03.18 -# Date: 2024-03-18 20:34:02 +# Version: 2024.07.04 +# Date: 2024-07-04 20:34:02 # # Vendors, devices and interfaces. Please keep sorted. 
@@ -2400,6 +2400,7 @@ 02e3 Xbox One Elite Controller 02e6 Xbox Wireless Adapter for Windows 02ea Xbox One Controller + 02f3 Xbox One Chatpad 02fd Xbox One S Controller [Bluetooth] 02fe Xbox Wireless Adapter for Windows 0306 Surface Pro 7 SD Card Reader @@ -4992,7 +4993,7 @@ 0a28 INDI AV-IN Device 1301 Network Controller 1302 i3 Gateway - 1303 3 Micro Module + 1303 i3 Micro Module 1304 i3 Module 1305 i3 Multi Sensing Module 04c1 U.S. Robotics (3Com) @@ -6433,6 +6434,7 @@ 2060 PT-E550W P-touch Label Printer 2061 PT-P700 P-touch Label Printer 2064 PT-P700 P-touch Label Printer RemovableDisk + 2065 PT-P750W P-Touch Label Writer 2074 PT-D600 P-touch Label Printer 209b QL-800 Label Printer 209c QL-810W Label Printer @@ -7439,6 +7441,7 @@ 03dd PTH-460 [Intuos Pro BT (S)] tablet 03ec DTH134 [DTH134] touchscreen 03ed DTC121 [DTC121] touchscreen + 03f0 DTH135 [Movink 13] 0400 PenPartner 4x5 4001 TPC4001 4004 TPC4004 @@ -9049,6 +9052,7 @@ 0752 micros Reader 0760 USB 2.0 Card Reader/Writer 0761 Genesys Mass Storage Device + 0769 SPR2801S [Lightspeeur 2801] 0780 USBFS DFU Adapter 07a0 Pen Flash 0880 Wasp (SL-6612) @@ -10898,7 +10902,7 @@ 0056 Agfa AP1100 Photo Printer 005d Mobile Mass Storage 005f Laser Pro LL [MFPrinter] - 0062 XG-76NA 802.11bg + 0062 XG-76NA / XG-760N 802.11b/g Wireless adapter 0078 Laser Pro Monochrome MFP 079d Alfadata Computer Corp. 0201 GamePort Adapter @@ -11169,6 +11173,7 @@ 1228 MPEG-2 Capture Device (M038) 1830 AVerTV Volar Video Capture (H830) 1871 TD310 DVB-T/T2/C dongle + 2553 Live Gamer Ultra 2.1 3835 AVerTV Volar Green HD (A835B) 850a AverTV Volar Black HD (A850) 850b AverTV Red HD+ (A850T) @@ -11180,6 +11185,7 @@ b300 A300 DVB-T TV receiver b800 MR800 FM Radio c039 DVD EZMaker 7 + d553 Live Gamer Ultra Pro-RGB e880 MPEG-2 Capture Device (E880) e882 MPEG-2 Capture Device (E882) 07cb Kingmax Technology, Inc. 
@@ -12286,7 +12292,7 @@ 0a0b WLU5053 802.11abgn Wireless Module [Broadcom BCM43236B] 0a13 AX88179 Gigabit Ethernet [Toshiba] 0b05 PX1220E-1G25 External hard drive - 0b09 PX1396E-3T01 External hard drive + 0b09 PX139xE 3.5 External HDD 0b1a STOR.E ALU 2S 1300 Wireless Broadband (CDMA EV-DO) SM-Bus Minicard Status Port 1301 Wireless Broadband (CDMA EV-DO) Minicard Status Port @@ -12420,6 +12426,7 @@ 010f nanoKONTROL studio controller 0117 nanoKONTROL2 MIDI Controller 012f SQ-1 + 0154 NTS-1 digital kit mkII 0203 KRONOS 0f03 K-Series K61P MIDI studio controller 0945 Pasco Scientific @@ -13004,6 +13011,7 @@ 5803 BCM5880 Secure Applications Processor with secure keyboard 5804 BCM5880 Secure Applications Processor with fingerprint swipe sensor 5832 BCM5880 Secure Applications Processor Smartcard reader + 5843 BCM58200 ControlVault 3 (FingerPrint sensor + Contacted SmartCard) 6300 Pirelli Remote NDIS Device 6410 BCM20703A1 Bluetooth 4.1 + LE bd11 BCM4320 802.11bg Wireless Adapter @@ -13020,11 +13028,14 @@ 0009 LP2844 Printer 0027 ZTC LP2844-Z-200dpi 0050 P120i / WM120i + 0062 GK420d Label Printer + 0065 ZM400 Label Printer 0080 GK420d Label Printer 0081 GK420t Label Printer 0084 GX420d Desktop Label Printer 008b HC100 wristbands Printer 008c ZP 450 Printer + 00a1 TLP2824 Plus 00d1 GC420d Label Printer 0110 ZD500 Desktop Label Printer 011c ZD410 Direct Thermal Label Printer @@ -13035,6 +13046,7 @@ 0010 MPMan MP-F40 MP3 Player 0a66 ClearCube Technology 0a67 Medeli Electronics Co., Ltd + ffff LCS Audio 0a68 Comaide Corp. 0a69 Chroma ate, Inc. 0a6b Green House Co., Ltd @@ -13246,10 +13258,11 @@ 0ac9 Micro Solutions, Inc. 
0000 Backpack CD-ReWriter 0001 BACKPACK 2 Cable - 0010 BACKPACK + 0010 BACKPACK CD Drive 0011 Backpack 40GB Hard Drive 0110 BACKPACK 0111 BackPack + 10ff BACKPACK 1234 BACKPACK 0aca OPEN Networks Ltd 1060 OPEN NT1 Plus II @@ -13308,7 +13321,7 @@ 3102 MemoryStick Card Reader 3201 MMC/SD+MemoryStick Card Reader 3216 HS Card Reader - 3260 7-in-1 Card Reader + 3260 ND3260 7-in-1 Card Reader 5010 ND5010 Card Reader 0af0 Option 5000 UMTS Card @@ -13382,6 +13395,7 @@ 17a0 Xonar U3 sound card 17a1 Eee Note EA800 (mass storage mode) 17ab USB-N13 802.11n Network Adapter (rev. B1) [Realtek RTL8192CU] + 17b5 Broadcom BCM20702A0 Bluetooth 17ba N10 Nano 802.11n Network Adapter [Realtek RTL8192CU] 17c2 ROG Spitfire 17c7 WL-330NUL From e06022e1bfc263e8b1393c7b948707a250f16f83 Mon Sep 17 00:00:00 2001 From: Colin Percival Date: Sun, 18 Aug 2024 01:59:18 +0000 Subject: [PATCH 036/145] Makefile.ec2: Add missing CLEANFILES entry Without this, "make clean ec2ami" won't build a new AMI. MFC after: 3 days Sponsored by: Amazon --- release/Makefile.ec2 | 1 + 1 file changed, 1 insertion(+) diff --git a/release/Makefile.ec2 b/release/Makefile.ec2 index 8f5f6f205779..4363aac21369 100644 --- a/release/Makefile.ec2 +++ b/release/Makefile.ec2 @@ -60,6 +60,7 @@ cw-ec2-portinstall: SSMOPTS_${_FL}_${_FS}= --ssm-name ${SSMPREFIX}/${TARGET_ARCH:S/aarch64/arm64/}/${_FL}/${_FS}/${REVISION}/${BRANCH} .endif EC2AMILIST+= ec2ami-${_FL}-${_FS} +CLEANFILES+= ec2ami-${_FL}-${_FS} ec2ami-${_FL}-${_FS}: cw-ec2-${_FL}-${_FS} ${CW_EC2_PORTINSTALL} .if !defined(AWSKEYFILE) || !exists(${AWSKEYFILE}) @echo "--------------------------------------------------------------" From 2dac89aee3304dd6eda9b267a0ad1cc6621a7094 Mon Sep 17 00:00:00 2001 From: Colin Percival Date: Sun, 18 Aug 2024 02:00:31 +0000 Subject: [PATCH 037/145] EC2: Bump AMI size to 8 GB 8 GB root disk images make FreeBSD/EC2 boot much faster than 6 GB root disk images. 
MFC after: 2 days Sponsored by: Amazon --- release/tools/ec2.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/release/tools/ec2.conf b/release/tools/ec2.conf index 989e8a05a318..602216d3c2d4 100644 --- a/release/tools/ec2.conf +++ b/release/tools/ec2.conf @@ -10,12 +10,12 @@ export VM_EXTRA_PACKAGES="${VM_EXTRA_PACKAGES} ebsnvme-id amazon-ssm-agent" # Services which should be enabled by default in rc.conf(5). export VM_RC_LIST="dev_aws_disk ntpd" -# Build with a 5.9 GB partition; the growfs rc.d script will expand +# Build with a 7.9 GB partition; the growfs rc.d script will expand # the partition to fill the root disk after the EC2 instance is launched. # Note that if this is set to <N>G, we will end up with an <N+1> GB disk # image since VMSIZE is the size of the filesystem partition, not the disk # which it resides within. -export VMSIZE=6000m +export VMSIZE=8000m # No swap space; it doesn't make sense to provision any as part of the disk # image when we could be launching onto a system with anywhere between 0.5 From c482d65cd187a5cc311ede3e328270bfcd014e73 Mon Sep 17 00:00:00 2001 From: Colin Percival Date: Sun, 18 Aug 2024 02:05:52 +0000 Subject: [PATCH 038/145] EC2: Drop UEFI-PREFERRED from AMI names Starting in September 2021 EC2 AMI names have included the boot method: "BIOS", "UEFI", or "UEFI-PREFERRED". The third option became available in June 2023 and allows AMIs to boot via UEFI on EC2 instance types which support that, but fall back to (much slower) BIOS booting on the instance types which don't support UEFI. Since UEFI-PREFERRED is basically a best-of-both-worlds option and is now the default, there's no point mentioning it in the AMI names. If for some reason an AMI is built with the boot method forced to BIOS or UEFI, that will still be included in the AMI name. This will not be MFCed, in case anyone has scripts which look at the AMI names on 13.x/14.x.
Sponsored by: Amazon --- release/Makefile.ec2 | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/release/Makefile.ec2 b/release/Makefile.ec2 index 4363aac21369..27e6340dca2b 100644 --- a/release/Makefile.ec2 +++ b/release/Makefile.ec2 @@ -29,6 +29,11 @@ BOOTMODEOPT= --uefi .if ${AMIBOOTMETHOD} == "UEFI-PREFERRED" && ${TARGET_ARCH} == "amd64" BOOTMODEOPT= --uefi-preferred .endif +.if ${AMIBOOTMETHOD} == "UEFI-PREFERRED" +AMIBASENAME=${TYPE} ${REVISION}-${BRANCH}-${TARGET}${AMINAMESUFFIX} +.else +AMIBASENAME=${TYPE} ${REVISION}-${BRANCH}-${TARGET}${AMINAMESUFFIX} ${AMIBOOTMETHOD} +.endif CLEANFILES+= ec2ami @@ -83,7 +88,7 @@ ec2ami-${_FL}-${_FS}: cw-ec2-${_FL}-${_FS} ${CW_EC2_PORTINSTALL} /usr/local/bin/bsdec2-image-upload ${PUBLISH} ${PUBLICSNAP} \ ${EC2ARCH} ${SSMOPTS_${_FL}_${_FS}} ${BOOTMODEOPT} --sriov --ena \ ${.OBJDIR}/${EC2-${_FL:tu}${_FS:tu}IMAGE} \ - "${TYPE} ${REVISION}-${BRANCH}-${TARGET}${AMINAMESUFFIX} ${AMIBOOTMETHOD} ${_FL} ${_FS:tu}" \ + "${AMIBASENAME} ${_FL} ${_FS:tu}" \ "${TYPE}/${TARGET} ${GITBRANCH}@${GITREV}" \ ${AWSREGION} ${AWSBUCKET} ${AWSKEYFILE} \ ${EC2SNSTOPIC} ${EC2SNSREL} ${EC2SNSVERS} From 2a916499470ae35178ca218e68175478037c3b0a Mon Sep 17 00:00:00 2001 From: Wolfram Schneider Date: Mon, 19 Aug 2024 19:32:51 +0000 Subject: [PATCH 039/145] bsd-family-tree: shorter URL for FreeBSD manual pages --- share/misc/bsd-family-tree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/misc/bsd-family-tree b/share/misc/bsd-family-tree index 955bb0f30e43..24de0a603731 100644 --- a/share/misc/bsd-family-tree +++ b/share/misc/bsd-family-tree @@ -934,7 +934,7 @@ FreeBSD Release Information URL: https://www.FreeBSD.org/releases/ Manual pages for FreeBSD and ports -URL: https://man.FreeBSD.org/cgi/man.cgi +URL: https://man.FreeBSD.org UNIX history graphing project URL: https://minnie.tuhs.org/Unix_History/index.html From 4132c4be4c0a4b80a4ef6f4b8ff0d8ac9a3b9939 Mon Sep 17 00:00:00 2001 From: Wolfram Schneider 
Date: Mon, 19 Aug 2024 19:40:34 +0000 Subject: [PATCH 040/145] bsd-family-tree: add FreeBSD Documentation Archive --- share/misc/bsd-family-tree | 3 +++ 1 file changed, 3 insertions(+) diff --git a/share/misc/bsd-family-tree b/share/misc/bsd-family-tree index 24de0a603731..4f9ee8d438d8 100644 --- a/share/misc/bsd-family-tree +++ b/share/misc/bsd-family-tree @@ -936,6 +936,9 @@ URL: https://www.FreeBSD.org/releases/ Manual pages for FreeBSD and ports URL: https://man.FreeBSD.org +FreeBSD Documentation Archive +URL: https://docs-archive.freebsd.org/doc/ + UNIX history graphing project URL: https://minnie.tuhs.org/Unix_History/index.html From 12a6257a96007222e5441d883709fca2a28febb5 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:53:27 +0100 Subject: [PATCH 041/145] sys/conf: Introduce NOSAN_CFLAGS and NOSAN_C To simplify disabling the kernel sanitizers in some files add NOSAN_CFLAGS and NOSAN_C variables. These are CFLAGS and NORMAL_C with the sanitizer flags removed. 
While here add MSAN_CFLAGS to simplify keeping KMSAN in kern_kcov.c Reviewed by: khng, brooks, imp, markj Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45498 --- sys/conf/files | 10 +++++----- sys/conf/files.arm64 | 6 +++--- sys/conf/kern.mk | 2 ++ sys/conf/kern.post.mk | 6 +++--- sys/conf/kern.pre.mk | 9 +++++++-- sys/conf/kmod.mk | 4 ++-- sys/modules/linux64/Makefile | 3 +-- sys/modules/vmm/Makefile | 15 ++++++--------- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/sys/conf/files b/sys/conf/files index df4c702540ae..6445f00e2801 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3811,7 +3811,7 @@ kern/kern_idle.c standard kern/kern_intr.c standard kern/kern_jail.c standard kern/kern_kcov.c optional kcov \ - compile-with "${NORMAL_C:N-fsanitize*} ${NORMAL_C:M-fsanitize=kernel-memory}" + compile-with "${NOSAN_C} ${MSAN_CFLAGS}" kern/kern_khelp.c standard kern/kern_kthread.c standard kern/kern_ktr.c optional ktr @@ -3879,7 +3879,7 @@ kern/stack_protector.c standard \ kern/subr_acl_nfs4.c optional ufs_acl | zfs kern/subr_acl_posix1e.c optional ufs_acl kern/subr_asan.c optional kasan \ - compile-with "${NORMAL_C:N-fsanitize*:N-fstack-protector*}" + compile-with "${NOSAN_C:N-fstack-protector*}" kern/subr_autoconf.c standard kern/subr_blist.c standard kern/subr_boot.c standard @@ -3891,10 +3891,10 @@ kern/subr_clock.c standard kern/subr_compressor.c standard \ compile-with "${NORMAL_C} -I$S/contrib/zstd/lib/freebsd" kern/subr_coverage.c optional coverage \ - compile-with "${NORMAL_C:N-fsanitize*:N-fno-sanitize*}" + compile-with "${NOSAN_C}" kern/subr_counter.c standard kern/subr_csan.c optional kcsan \ - compile-with "${NORMAL_C:N-fsanitize*:N-fstack-protector*}" + compile-with "${NOSAN_C:N-fstack-protector*}" kern/subr_devstat.c standard kern/subr_disk.c standard kern/subr_early.c standard @@ -3914,7 +3914,7 @@ kern/subr_mchain.c optional libmchain kern/subr_memdesc.c standard kern/subr_module.c standard kern/subr_msan.c 
optional kmsan \ - compile-with "${NORMAL_C:N-fsanitize*:N-fno-sanitize*:N-fstack-protector*}" + compile-with "${NOSAN_C:N-fstack-protector*}" kern/subr_msgbuf.c standard kern/subr_param.c standard kern/subr_pcpu.c standard diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 0ff18cc6dd99..0688aabb562b 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -81,7 +81,7 @@ arm64/arm64/trap.c standard arm64/arm64/uio_machdep.c standard arm64/arm64/undefined.c standard arm64/arm64/unwind.c optional ddb | kdtrace_hooks | stack \ - compile-with "${NORMAL_C:N-fsanitize*:N-fno-sanitize*}" + compile-with "${NOSAN_C}" arm64/arm64/vfp.c standard arm64/arm64/vm_machdep.c standard @@ -124,10 +124,10 @@ arm64/vmm/vmm_arm64.c optional vmm arm64/vmm/vmm_reset.c optional vmm arm64/vmm/vmm_call.S optional vmm arm64/vmm/vmm_hyp_exception.S optional vmm \ - compile-with "${NORMAL_C:N-fsanitize*:N-fno-sanitize*:N-mbranch-protection*} -fpie" \ + compile-with "${NOSAN_C:N-mbranch-protection*} -fpie" \ no-obj arm64/vmm/vmm_hyp.c optional vmm \ - compile-with "${NORMAL_C:N-fsanitize*:N-fno-sanitize*:N-mbranch-protection*} -fpie" \ + compile-with "${NOSAN_C:N-mbranch-protection*} -fpie" \ no-obj vmm_hyp_blob.elf.full optional vmm \ dependency "vmm_hyp.o vmm_hyp_exception.o" \ diff --git a/sys/conf/kern.mk b/sys/conf/kern.mk index 106f09d80ac1..93187d93ac63 100644 --- a/sys/conf/kern.mk +++ b/sys/conf/kern.mk @@ -294,6 +294,8 @@ CSTD?= gnu99 CFLAGS+= -std=${CSTD} .endif # CSTD +NOSAN_CFLAGS= ${CFLAGS:N-fsanitize*:N-fno-sanitize*:N-fasan-shadow-offset*} + # Please keep this if in sync with bsd.sys.mk .if ${LD} != "ld" && (${CC:[1]:H} != ${LD:[1]:H} || ${LD:[1]:T} != "ld") # Add -fuse-ld=${LD} if $LD is in a different directory or not called "ld". 
diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk index 5760ebd987f1..5f6ec2db498d 100644 --- a/sys/conf/kern.post.mk +++ b/sys/conf/kern.post.mk @@ -245,21 +245,21 @@ offset.inc: $S/kern/genoffset.sh genoffset.o NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genoffset.sh genoffset.o > ${.TARGET} genoffset.o: $S/kern/genoffset.c - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} \ + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} \ -fcommon $S/kern/genoffset.c # genoffset_test.o is not actually used for anything - the point of compiling it # is to exercise the CTASSERT that checks that the offsets in the offset.inc # _lite struct(s) match those in the original(s). genoffset_test.o: $S/kern/genoffset.c offset.inc - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} \ + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} \ -fcommon -DOFFSET_TEST $S/kern/genoffset.c -o ${.TARGET} assym.inc: $S/kern/genassym.sh genassym.o genoffset_test.o NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh genassym.o > ${.TARGET} genassym.o: $S/$M/$M/genassym.c offset.inc - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} \ + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} \ -fcommon $S/$M/$M/genassym.c OBJS_DEPEND_GUESS+= opt_global.h diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index 4d626cdd2aad..e787d023d9a9 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -129,11 +129,12 @@ KMSAN_ENABLED!= grep KMSAN opt_global.h || true ; echo .if !empty(KMSAN_ENABLED) # Disable -fno-sanitize-memory-param-retval until interceptors have been # updated to work properly with it. 
-SAN_CFLAGS+= -DSAN_NEEDS_INTERCEPTORS -DSAN_INTERCEPTOR_PREFIX=kmsan \ +MSAN_CFLAGS+= -DSAN_NEEDS_INTERCEPTORS -DSAN_INTERCEPTOR_PREFIX=kmsan \ -fsanitize=kernel-memory .if ${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} >= 160000 -SAN_CFLAGS+= -fno-sanitize-memory-param-retval +MSAN_CFLAGS+= -fno-sanitize-memory-param-retval .endif +SAN_CFLAGS+= ${MSAN_CFLAGS} .endif KUBSAN_ENABLED!= grep KUBSAN opt_global.h || true ; echo @@ -212,6 +213,10 @@ NORMAL_FWO= ${CC:N${CCACHE_BIN}} -c ${ASM_CFLAGS} ${WERROR} -o ${.TARGET} \ $S/kern/firmw.S -DFIRMW_FILE=\""${.ALLSRC:M*.fw}"\" \ -DFIRMW_SYMBOL="${.ALLSRC:M*.fw:C/[-.\/]/_/g}" +# Remove sanitizer arguments. Some -fno-sanitize* and -fasan-shadow-offset* +# arguments become an error if the appropriate sanitizer is not enabled. +NOSAN_C= ${NORMAL_C:N-fsanitize*:N-fno-sanitize*:N-fasan-shadow-offset*} + # for ZSTD in the kernel (include zstd/lib/freebsd before other CFLAGS) ZSTD_C= ${CC} -c -DZSTD_HEAPMODE=1 -I$S/contrib/zstd/lib/freebsd ${CFLAGS} \ -I$S/contrib/zstd/lib -I$S/contrib/zstd/lib/common ${WERROR} \ diff --git a/sys/conf/kmod.mk b/sys/conf/kmod.mk index ed186619434d..4dc66c69d67b 100644 --- a/sys/conf/kmod.mk +++ b/sys/conf/kmod.mk @@ -526,13 +526,13 @@ assym.inc: ${SYSDIR}/kern/genassym.sh sh ${SYSDIR}/kern/genassym.sh genassym.o > ${.TARGET} genassym.o: ${SYSDIR}/${MACHINE}/${MACHINE}/genassym.c offset.inc genassym.o: ${SRCS:Mopt_*.h} - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} -fcommon \ + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} -fcommon \ ${SYSDIR}/${MACHINE}/${MACHINE}/genassym.c offset.inc: ${SYSDIR}/kern/genoffset.sh genoffset.o sh ${SYSDIR}/kern/genoffset.sh genoffset.o > ${.TARGET} genoffset.o: ${SYSDIR}/kern/genoffset.c genoffset.o: ${SRCS:Mopt_*.h} - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} -fcommon \ + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} -fcommon \ ${SYSDIR}/kern/genoffset.c CLEANDEPENDFILES+= ${_ILINKS} diff --git 
a/sys/modules/linux64/Makefile b/sys/modules/linux64/Makefile index d558319f3ba2..b23891a65a4f 100644 --- a/sys/modules/linux64/Makefile +++ b/sys/modules/linux64/Makefile @@ -89,8 +89,7 @@ linux_support.o: linux_support.S assym.inc linux_assym.h ${.ALLSRC:M*.S:u} -o ${.TARGET} linux_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} \ - -fcommon ${.IMPSRC} + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} -fcommon ${.IMPSRC} .if !defined(KERNBUILDDIR) .warning Building Linuxulator outside of a kernel does not make sense diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index 88586a39bf96..055ae0df3a65 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -44,12 +44,11 @@ CLEANFILES+= vmm_hyp_blob.elf vmm_hyp_blob.bin vmm_hyp_exception.o: vmm_hyp_exception.S ${CC} -c -x assembler-with-cpp -DLOCORE \ - ${CFLAGS:N-fsanitize*:N-fno-sanitize*:N-mbranch-protection*} \ - ${.IMPSRC} -o ${.TARGET} -fpie + ${NOSAN_CFLAGS:N-mbranch-protection*} ${.IMPSRC} -o ${.TARGET} -fpie vmm_hyp.o: vmm_hyp.c - ${CC} -c ${CFLAGS:N-fsanitize*:N-fno-sanitize*:N-mbranch-protection*} \ - ${.IMPSRC} -o ${.TARGET} -fpie + ${CC} -c ${NOSAN_CFLAGS:N-mbranch-protection*} ${.IMPSRC} \ + -o ${.TARGET} -fpie vmm_hyp_blob.elf.full: vmm_hyp_exception.o vmm_hyp.o ${LD} -m ${LD_EMULATION} -Bdynamic -L ${SYSDIR}/conf -T ${SYSDIR}/conf/ldscript.arm64 \ @@ -135,14 +134,12 @@ svm_support.o: ${.IMPSRC} -o ${.TARGET} hyp_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} + ${CC} -c ${NOSAN_CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} vmx_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} \ - -fcommon ${.IMPSRC} + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} -fcommon ${.IMPSRC} svm_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto*:N-fno-common:N-fsanitize*:N-fno-sanitize*} \ - -fcommon ${.IMPSRC} + ${CC} -c ${NOSAN_CFLAGS:N-flto*:N-fno-common} -fcommon 
${.IMPSRC} .include From 43e8849bc29414036ccaef7788de95a07ad32ab5 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:59:49 +0100 Subject: [PATCH 042/145] conf: Enable BTI checking in the arm64 kernel To ensure new code has BTI support make it an error to not have the BTI ELF note when linking the kernel and kernel modules. Reviewed by: kib, emaste Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D45469 --- sys/conf/kern.mk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/conf/kern.mk b/sys/conf/kern.mk index 93187d93ac63..079bd1173fad 100644 --- a/sys/conf/kern.mk +++ b/sys/conf/kern.mk @@ -143,6 +143,9 @@ CFLAGS += -mgeneral-regs-only CFLAGS += -ffixed-x18 # Build with BTI+PAC CFLAGS += -mbranch-protection=standard +.if ${LINKER_TYPE} == "lld" +LDFLAGS += -Wl,-zbti-report=error +.endif # TODO: support outline atomics CFLAGS += -mno-outline-atomics INLINE_LIMIT?= 8000 From 4db15ab2c65e60f4d49d40ad6922ca301b184510 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:05 +0100 Subject: [PATCH 043/145] arm64: Add counter timer registers to armreg.h Reviewed by: imp Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46071 --- sys/arm64/include/armreg.h | 48 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h index b2ab472dad0d..ce21bf4de0a9 100644 --- a/sys/arm64/include/armreg.h +++ b/sys/arm64/include/armreg.h @@ -231,6 +231,22 @@ #define CLIDR_CTYPE_ID 0x3 /* Split instruction and data */ #define CLIDR_CTYPE_UNIFIED 0x4 /* Unified */ +/* CNTKCTL_EL1 - Counter-timer Kernel Control Register */ +#define CNTKCTL_EL1 MRS_REG(CNTKCTL_EL0) +#define CNTKCTL_EL1_op0 3 +#define CNTKCTL_EL1_op1 0 +#define CNTKCTL_EL1_CRn 14 +#define CNTKCTL_EL1_CRm 1 +#define CNTKCTL_EL1_op2 0 + +/* CNTKCTL_EL12 - Counter-timer Kernel Control Register */ +#define CNTKCTL_EL12 MRS_REG(CNTKCTL_EL0) +#define 
CNTKCTL_EL12_op0 3 +#define CNTKCTL_EL12_op1 5 +#define CNTKCTL_EL12_CRn 14 +#define CNTKCTL_EL12_CRm 1 +#define CNTKCTL_EL12_op2 0 + /* CNTP_CTL_EL0 - Counter-timer Physical Timer Control register */ #define CNTP_CTL_EL0 MRS_REG(CNTP_CTL_EL0) #define CNTP_CTL_EL0_op0 3 @@ -266,6 +282,38 @@ #define CNTPCT_EL0_CRm 0 #define CNTPCT_EL0_op2 1 +/* CNTV_CTL_EL0 - Counter-timer Virtual Timer Control register */ +#define CNTV_CTL_EL0 MRS_REG(CNTV_CTL_EL0) +#define CNTV_CTL_EL0_op0 3 +#define CNTV_CTL_EL0_op1 3 +#define CNTV_CTL_EL0_CRn 14 +#define CNTV_CTL_EL0_CRm 3 +#define CNTV_CTL_EL0_op2 1 + +/* CNTV_CTL_EL02 - Counter-timer Virtual Timer Control register */ +#define CNTV_CTL_EL02 MRS_REG(CNTV_CTL_EL02) +#define CNTV_CTL_EL02_op0 3 +#define CNTV_CTL_EL02_op1 5 +#define CNTV_CTL_EL02_CRn 14 +#define CNTV_CTL_EL02_CRm 3 +#define CNTV_CTL_EL02_op2 1 + +/* CNTV_CVAL_EL0 - Counter-timer Virtual Timer CompareValue register */ +#define CNTV_CVAL_EL0 MRS_REG(CNTV_CVAL_EL0) +#define CNTV_CVAL_EL0_op0 3 +#define CNTV_CVAL_EL0_op1 3 +#define CNTV_CVAL_EL0_CRn 14 +#define CNTV_CVAL_EL0_CRm 3 +#define CNTV_CVAL_EL0_op2 2 + +/* CNTV_CVAL_EL02 - Counter-timer Virtual Timer CompareValue register */ +#define CNTV_CVAL_EL02 MRS_REG(CNTV_CVAL_EL02) +#define CNTV_CVAL_EL02_op0 3 +#define CNTV_CVAL_EL02_op1 5 +#define CNTV_CVAL_EL02_CRn 14 +#define CNTV_CVAL_EL02_CRm 3 +#define CNTV_CVAL_EL02_op2 2 + /* CONTEXTIDR_EL1 - Context ID register */ #define CONTEXTIDR_EL1 MRS_REG(CONTEXTIDR_EL1) #define CONTEXTIDR_EL1_REG MRS_REG_ALT_NAME(CONTEXTIDR_EL1) From 3d61bcf1eb8403780418096e4f520573acad6c0d Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:22 +0100 Subject: [PATCH 044/145] arm64/vmm: Start to extract code not needed by VHE We can share some of the vmm code between VHE and non-VHE modes. To support this create new files that include the common code and create macros to name what will be the common functions. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46072 --- sys/arm64/vmm/vmm_hyp.c | 58 ++++++++++++++++-------------- sys/arm64/vmm/vmm_hyp_exception.S | 6 ++-- sys/arm64/vmm/vmm_nvhe.c | 31 ++++++++++++++++ sys/arm64/vmm/vmm_nvhe_exception.S | 30 ++++++++++++++++ sys/conf/files.arm64 | 6 ++-- sys/modules/vmm/Makefile | 9 ++--- 6 files changed, 103 insertions(+), 37 deletions(-) create mode 100644 sys/arm64/vmm/vmm_nvhe.c create mode 100644 sys/arm64/vmm/vmm_nvhe_exception.S diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index 9ff250e798e7..1226876aa642 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -41,7 +41,7 @@ struct hypctx; uint64_t vmm_hyp_enter(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); -uint64_t vmm_enter_guest(struct hypctx *); +uint64_t VMM_HYP_FUNC(do_call_guest)(struct hypctx *); static void vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) @@ -496,7 +496,7 @@ vmm_hyp_call_guest(struct hyp *hyp, struct hypctx *hypctx) WRITE_SPECIALREG(mdcr_el2, hypctx->mdcr_el2); /* Call into the guest */ - ret = vmm_enter_guest(hypctx); + ret = VMM_HYP_FUNC(do_call_guest)(hypctx); WRITE_SPECIALREG(mdcr_el2, host_hypctx.mdcr_el2); isb(); @@ -566,8 +566,20 @@ vmm_hyp_call_guest(struct hyp *hyp, struct hypctx *hypctx) return (ret); } -static uint64_t -vmm_hyp_read_reg(uint64_t reg) +VMM_STATIC uint64_t +VMM_HYP_FUNC(enter_guest)(struct hyp *hyp, struct hypctx *hypctx) +{ + uint64_t ret; + + do { + ret = vmm_hyp_call_guest(hyp, hypctx); + } while (ret == EXCP_TYPE_REENTER); + + return (ret); +} + +VMM_STATIC uint64_t +VMM_HYP_FUNC(read_reg)(uint64_t reg) { switch (reg) { case HYP_REG_ICH_VTR: @@ -579,18 +591,16 @@ vmm_hyp_read_reg(uint64_t reg) return (0); } -static int -vmm_clean_s2_tlbi(void) +VMM_STATIC void +VMM_HYP_FUNC(clean_s2_tlbi)(void) { dsb(ishst); __asm __volatile("tlbi alle1is"); dsb(ish); - - return (0); } -static int 
-vm_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_size_t eva, +VMM_STATIC void +VMM_HYP_FUNC(s2_tlbi_range)(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, bool final_only) { uint64_t end, r, start; @@ -634,12 +644,10 @@ vm_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_size_t eva, /* Switch back t othe host vttbr */ WRITE_SPECIALREG(vttbr_el2, host_vttbr); isb(); - - return (0); } -static int -vm_s2_tlbi_all(uint64_t vttbr) +VMM_STATIC void +VMM_HYP_FUNC(s2_tlbi_all)(uint64_t vttbr) { uint64_t host_vttbr; @@ -656,8 +664,6 @@ vm_s2_tlbi_all(uint64_t vttbr) /* Switch back t othe host vttbr */ WRITE_SPECIALREG(vttbr_el2, host_vttbr); isb(); - - return (0); } static int @@ -705,27 +711,25 @@ uint64_t vmm_hyp_enter(uint64_t handle, uint64_t x1, uint64_t x2, uint64_t x3, uint64_t x4, uint64_t x5, uint64_t x6, uint64_t x7) { - uint64_t ret; - switch (handle) { case HYP_ENTER_GUEST: - do { - ret = vmm_hyp_call_guest((struct hyp *)x1, - (struct hypctx *)x2); - } while (ret == EXCP_TYPE_REENTER); - return (ret); + return (VMM_HYP_FUNC(enter_guest)((struct hyp *)x1, + (struct hypctx *)x2)); case HYP_READ_REGISTER: - return (vmm_hyp_read_reg(x1)); + return (VMM_HYP_FUNC(read_reg)(x1)); case HYP_CLEAN_S2_TLBI: - return (vmm_clean_s2_tlbi()); + VMM_HYP_FUNC(clean_s2_tlbi()); + return (0); case HYP_DC_CIVAC: return (vmm_dc_civac(x1, x2)); case HYP_EL2_TLBI: return (vmm_el2_tlbi(x1, x2, x3)); case HYP_S2_TLBI_RANGE: - return (vm_s2_tlbi_range(x1, x2, x3, x4)); + VMM_HYP_FUNC(s2_tlbi_range)(x1, x2, x3, x4); + return (0); case HYP_S2_TLBI_ALL: - return (vm_s2_tlbi_all(x1)); + VMM_HYP_FUNC(s2_tlbi_all)(x1); + return (0); case HYP_CLEANUP: /* Handled in vmm_hyp_exception.S */ default: break; diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S index 9a9dc6901f40..3b9c08af97ac 100644 --- a/sys/arm64/vmm/vmm_hyp_exception.S +++ b/sys/arm64/vmm/vmm_hyp_exception.S @@ -349,12 +349,12 @@ LEND(handle_el2_el1_error64) /* * Usage: - * uint64_t 
vmm_enter_guest(struct hypctx *hypctx) + * uint64_t vmm_do_call_guest(struct hypctx *hypctx) * * Expecting: * x0 - hypctx address */ -ENTRY(vmm_enter_guest) +ENTRY(VMM_HYP_FUNC(do_call_guest)) /* Save hypctx address */ msr tpidr_el2, x0 @@ -363,7 +363,7 @@ ENTRY(vmm_enter_guest) /* Enter guest */ ERET -END(vmm_enter_guest) +END(VMM_HYP_FUNC(do_call_guest)) /* * Usage: diff --git a/sys/arm64/vmm/vmm_nvhe.c b/sys/arm64/vmm/vmm_nvhe.c new file mode 100644 index 000000000000..768e2132522d --- /dev/null +++ b/sys/arm64/vmm/vmm_nvhe.c @@ -0,0 +1,31 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#define VMM_STATIC static +#define VMM_HYP_FUNC(func) vmm_nvhe_ ## func + +#include "vmm_hyp.c" diff --git a/sys/arm64/vmm/vmm_nvhe_exception.S b/sys/arm64/vmm/vmm_nvhe_exception.S new file mode 100644 index 000000000000..3bc0ff591399 --- /dev/null +++ b/sys/arm64/vmm/vmm_nvhe_exception.S @@ -0,0 +1,30 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#define VMM_HYP_FUNC(func) vmm_nvhe_ ## func + +#include "vmm_hyp_exception.S" diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 0688aabb562b..10590d6c77e2 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -123,14 +123,14 @@ arm64/vmm/vmm_stat.c optional vmm arm64/vmm/vmm_arm64.c optional vmm arm64/vmm/vmm_reset.c optional vmm arm64/vmm/vmm_call.S optional vmm -arm64/vmm/vmm_hyp_exception.S optional vmm \ +arm64/vmm/vmm_nvhe_exception.S optional vmm \ compile-with "${NOSAN_C:N-mbranch-protection*} -fpie" \ no-obj -arm64/vmm/vmm_hyp.c optional vmm \ +arm64/vmm/vmm_nvhe.c optional vmm \ compile-with "${NOSAN_C:N-mbranch-protection*} -fpie" \ no-obj vmm_hyp_blob.elf.full optional vmm \ - dependency "vmm_hyp.o vmm_hyp_exception.o" \ + dependency "vmm_nvhe.o vmm_hyp_exception.o" \ compile-with "${SYSTEM_LD_BASECMD} -o ${.TARGET} ${.ALLSRC} --defsym=_start='0x0' --defsym=text_start='0x0'" \ no-obj no-implicit-rule vmm_hyp_blob.elf optional vmm \ diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index 055ae0df3a65..409804f4e25c 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -38,19 +38,20 @@ SRCS+= vgic.c \ vgic_v3.c \ vtimer.c -CLEANFILES+= vmm_hyp_exception.o vmm_hyp.o +CLEANFILES+= vmm_nvhe_exception.o vmm_nvhe.o + CLEANFILES+= vmm_hyp_blob.elf.full CLEANFILES+= vmm_hyp_blob.elf vmm_hyp_blob.bin -vmm_hyp_exception.o: vmm_hyp_exception.S +vmm_nvhe_exception.o: vmm_nvhe_exception.S ${CC} -c -x assembler-with-cpp -DLOCORE \ ${NOSAN_CFLAGS:N-mbranch-protection*} ${.IMPSRC} -o ${.TARGET} -fpie -vmm_hyp.o: vmm_hyp.c +vmm_nvhe.o: vmm_nvhe.c ${CC} -c ${NOSAN_CFLAGS:N-mbranch-protection*} ${.IMPSRC} \ -o ${.TARGET} -fpie -vmm_hyp_blob.elf.full: vmm_hyp_exception.o vmm_hyp.o +vmm_hyp_blob.elf.full: vmm_nvhe_exception.o vmm_nvhe.o ${LD} -m ${LD_EMULATION} -Bdynamic -L ${SYSDIR}/conf -T ${SYSDIR}/conf/ldscript.arm64 \ ${_LDFLAGS:N-zbti-report*} --no-warn-mismatch --warn-common --export-dynamic \ 
--dynamic-linker /red/herring -X -o ${.TARGET} ${.ALLSRC} \ From 4b6c9f84deb53b111ccbcbea595b2cdd347ca7f9 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:31 +0100 Subject: [PATCH 045/145] arm64/vmm: Move nVHE-only code to the new file There are some functions that are only needed in non-VHE mode. These are used to handle hypervisor calls from the kernel, and to manage the page tables in EL2. As these won't be used by the VHE code we can move them to the non-VHE specific files. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46073 --- sys/arm64/vmm/vmm_hyp.c | 74 ------------------------ sys/arm64/vmm/vmm_hyp_exception.S | 88 ----------------------------- sys/arm64/vmm/vmm_nvhe.c | 83 +++++++++++++++++++++++++++ sys/arm64/vmm/vmm_nvhe_exception.S | 90 ++++++++++++++++++++++++++++++ 4 files changed, 173 insertions(+), 162 deletions(-) diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index 1226876aa642..92f8cd25251d 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -39,8 +39,6 @@ struct hypctx; -uint64_t vmm_hyp_enter(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t); uint64_t VMM_HYP_FUNC(do_call_guest)(struct hypctx *); static void @@ -665,75 +663,3 @@ VMM_HYP_FUNC(s2_tlbi_all)(uint64_t vttbr) WRITE_SPECIALREG(vttbr_el2, host_vttbr); isb(); } - -static int -vmm_dc_civac(uint64_t start, uint64_t len) -{ - size_t line_size, end; - uint64_t ctr; - - ctr = READ_SPECIALREG(ctr_el0); - line_size = sizeof(int) << CTR_DLINE_SIZE(ctr); - end = start + len; - dsb(ishst); - /* Clean and Invalidate the D-cache */ - for (; start < end; start += line_size) - __asm __volatile("dc civac, %0" :: "r" (start) : "memory"); - dsb(ish); - return (0); -} - -static int -vmm_el2_tlbi(uint64_t type, uint64_t start, uint64_t len) -{ - uint64_t end, r; - - dsb(ishst); - switch (type) { - default: - case HYP_EL2_TLBI_ALL: - __asm __volatile("tlbi alle2" ::: "memory"); - break; - 
case HYP_EL2_TLBI_VA: - end = TLBI_VA(start + len); - start = TLBI_VA(start); - for (r = start; r < end; r += TLBI_VA_L3_INCR) { - __asm __volatile("tlbi vae2is, %0" :: "r"(r)); - } - break; - } - dsb(ish); - - return (0); -} - -uint64_t -vmm_hyp_enter(uint64_t handle, uint64_t x1, uint64_t x2, uint64_t x3, - uint64_t x4, uint64_t x5, uint64_t x6, uint64_t x7) -{ - switch (handle) { - case HYP_ENTER_GUEST: - return (VMM_HYP_FUNC(enter_guest)((struct hyp *)x1, - (struct hypctx *)x2)); - case HYP_READ_REGISTER: - return (VMM_HYP_FUNC(read_reg)(x1)); - case HYP_CLEAN_S2_TLBI: - VMM_HYP_FUNC(clean_s2_tlbi()); - return (0); - case HYP_DC_CIVAC: - return (vmm_dc_civac(x1, x2)); - case HYP_EL2_TLBI: - return (vmm_el2_tlbi(x1, x2, x3)); - case HYP_S2_TLBI_RANGE: - VMM_HYP_FUNC(s2_tlbi_range)(x1, x2, x3, x4); - return (0); - case HYP_S2_TLBI_ALL: - VMM_HYP_FUNC(s2_tlbi_all)(x1); - return (0); - case HYP_CLEANUP: /* Handled in vmm_hyp_exception.S */ - default: - break; - } - - return (0); -} diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S index 3b9c08af97ac..9a1648130f49 100644 --- a/sys/arm64/vmm/vmm_hyp_exception.S +++ b/sys/arm64/vmm/vmm_hyp_exception.S @@ -145,29 +145,6 @@ b handle_\name .endm - .section ".vmm_vectors","ax" - .align 11 -hyp_init_vectors: - vempty /* Synchronous EL2t */ - vempty /* IRQ EL2t */ - vempty /* FIQ EL2t */ - vempty /* Error EL2t */ - - vempty /* Synchronous EL2h */ - vempty /* IRQ EL2h */ - vempty /* FIQ EL2h */ - vempty /* Error EL2h */ - - vector hyp_init /* Synchronous 64-bit EL1 */ - vempty /* IRQ 64-bit EL1 */ - vempty /* FIQ 64-bit EL1 */ - vempty /* Error 64-bit EL1 */ - - vempty /* Synchronous 32-bit EL1 */ - vempty /* IRQ 32-bit EL1 */ - vempty /* FIQ 32-bit EL1 */ - vempty /* Error 32-bit EL1 */ - .text .align 11 hyp_vectors: @@ -191,50 +168,6 @@ hyp_vectors: vempty /* FIQ 32-bit EL1 */ vempty /* Error 32-bit EL1 */ -/* - * Initialize the hypervisor mode with a new exception vector table, translation 
- * table and stack. - * - * Expecting: - * x0 - translation tables physical address - * x1 - stack top virtual address - * x2 - TCR_EL2 value - * x3 - SCTLR_EL2 value - * x4 - VTCR_EL2 value - */ -LENTRY(handle_hyp_init) - /* Install the new exception vectors */ - adrp x6, hyp_vectors - add x6, x6, :lo12:hyp_vectors - msr vbar_el2, x6 - /* Set the stack top address */ - mov sp, x1 - /* Use the host VTTBR_EL2 to tell the host and the guests apart */ - mov x9, #VTTBR_HOST - msr vttbr_el2, x9 - /* Load the base address for the translation tables */ - msr ttbr0_el2, x0 - /* Invalidate the TLB */ - dsb ish - tlbi alle2 - dsb ishst - isb - /* Use the same memory attributes as EL1 */ - mrs x9, mair_el1 - msr mair_el2, x9 - /* Configure address translation */ - msr tcr_el2, x2 - isb - /* Set the system control register for EL2 */ - msr sctlr_el2, x3 - /* Set the Stage 2 translation control register */ - msr vtcr_el2, x4 - /* Return success */ - mov x0, #0 - /* MMU is up and running */ - ERET -LEND(handle_hyp_init) - .macro do_world_switch_to_host save_guest_registers restore_host_registers @@ -364,24 +297,3 @@ ENTRY(VMM_HYP_FUNC(do_call_guest)) /* Enter guest */ ERET END(VMM_HYP_FUNC(do_call_guest)) - -/* - * Usage: - * void vmm_cleanup(uint64_t handle, void *hyp_stub_vectors) - * - * Expecting: - * x1 - physical address of hyp_stub_vectors - */ -LENTRY(vmm_cleanup) - /* Restore the stub vectors */ - msr vbar_el2, x1 - - /* Disable the MMU */ - dsb sy - mrs x2, sctlr_el2 - bic x2, x2, #SCTLR_EL2_M - msr sctlr_el2, x2 - isb - - ERET -LEND(vmm_cleanup) diff --git a/sys/arm64/vmm/vmm_nvhe.c b/sys/arm64/vmm/vmm_nvhe.c index 768e2132522d..1127a6e37781 100644 --- a/sys/arm64/vmm/vmm_nvhe.c +++ b/sys/arm64/vmm/vmm_nvhe.c @@ -1,8 +1,12 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * + * Copyright (c) 2021 Andrew Turner * Copyright (c) 2024 Arm Ltd * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -29,3 +33,82 @@ #define VMM_HYP_FUNC(func) vmm_nvhe_ ## func #include "vmm_hyp.c" + +uint64_t vmm_hyp_enter(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t); + +/* + * Handlers for EL2 address space. Only needed by non-VHE code as in VHE the + * kernel is in EL2 so pmap will manage the address space. + */ +static int +vmm_dc_civac(uint64_t start, uint64_t len) +{ + size_t line_size, end; + uint64_t ctr; + + ctr = READ_SPECIALREG(ctr_el0); + line_size = sizeof(int) << CTR_DLINE_SIZE(ctr); + end = start + len; + dsb(ishst); + /* Clean and Invalidate the D-cache */ + for (; start < end; start += line_size) + __asm __volatile("dc civac, %0" :: "r" (start) : "memory"); + dsb(ish); + return (0); +} + +static int +vmm_el2_tlbi(uint64_t type, uint64_t start, uint64_t len) +{ + uint64_t end, r; + + dsb(ishst); + switch (type) { + default: + case HYP_EL2_TLBI_ALL: + __asm __volatile("tlbi alle2" ::: "memory"); + break; + case HYP_EL2_TLBI_VA: + end = TLBI_VA(start + len); + start = TLBI_VA(start); + for (r = start; r < end; r += TLBI_VA_L3_INCR) { + __asm __volatile("tlbi vae2is, %0" :: "r"(r)); + } + break; + } + dsb(ish); + + return (0); +} + +uint64_t +vmm_hyp_enter(uint64_t handle, uint64_t x1, uint64_t x2, uint64_t x3, + uint64_t x4, uint64_t x5, uint64_t x6, uint64_t x7) +{ + switch (handle) { + case HYP_ENTER_GUEST: + return (VMM_HYP_FUNC(enter_guest)((struct hyp *)x1, + (struct hypctx *)x2)); + case HYP_READ_REGISTER: + return (VMM_HYP_FUNC(read_reg)(x1)); + case HYP_CLEAN_S2_TLBI: + VMM_HYP_FUNC(clean_s2_tlbi()); + return (0); + case HYP_DC_CIVAC: + return (vmm_dc_civac(x1, x2)); + case HYP_EL2_TLBI: + return (vmm_el2_tlbi(x1, x2, x3)); + case HYP_S2_TLBI_RANGE: + VMM_HYP_FUNC(s2_tlbi_range)(x1, x2, x3, x4); + return (0); + case HYP_S2_TLBI_ALL: + VMM_HYP_FUNC(s2_tlbi_all)(x1); +
return (0); + case HYP_CLEANUP: /* Handled in vmm_hyp_exception.S */ + default: + break; + } + + return (0); +} diff --git a/sys/arm64/vmm/vmm_nvhe_exception.S b/sys/arm64/vmm/vmm_nvhe_exception.S index 3bc0ff591399..17bc4cb70366 100644 --- a/sys/arm64/vmm/vmm_nvhe_exception.S +++ b/sys/arm64/vmm/vmm_nvhe_exception.S @@ -28,3 +28,93 @@ #define VMM_HYP_FUNC(func) vmm_nvhe_ ## func #include "vmm_hyp_exception.S" + + .section ".vmm_vectors","ax" + .align 11 +hyp_init_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* Error EL2h */ + + vector hyp_init /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + + .text + +/* + * Initialize the hypervisor mode with a new exception vector table, translation + * table and stack. 
+ * + * Expecting: + * x0 - translation tables physical address + * x1 - stack top virtual address + * x2 - TCR_EL2 value + * x3 - SCTLR_EL2 value + * x4 - VTCR_EL2 value + */ +LENTRY(handle_hyp_init) + /* Install the new exception vectors */ + adrp x6, hyp_vectors + add x6, x6, :lo12:hyp_vectors + msr vbar_el2, x6 + /* Set the stack top address */ + mov sp, x1 + /* Use the host VTTBR_EL2 to tell the host and the guests apart */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 + /* Load the base address for the translation tables */ + msr ttbr0_el2, x0 + /* Invalidate the TLB */ + dsb ish + tlbi alle2 + dsb ishst + isb + /* Use the same memory attributes as EL1 */ + mrs x9, mair_el1 + msr mair_el2, x9 + /* Configure address translation */ + msr tcr_el2, x2 + isb + /* Set the system control register for EL2 */ + msr sctlr_el2, x3 + /* Set the Stage 2 translation control register */ + msr vtcr_el2, x4 + /* Return success */ + mov x0, #0 + /* MMU is up and running */ + ERET +LEND(handle_hyp_init) + +/* + * Usage: + * void vmm_cleanup(uint64_t handle, void *hyp_stub_vectors) + * + * Expecting: + * x1 - physical address of hyp_stub_vectors + */ +LENTRY(vmm_cleanup) + /* Restore the stub vectors */ + msr vbar_el2, x1 + + /* Disable the MMU */ + dsb sy + mrs x2, sctlr_el2 + bic x2, x2, #SCTLR_EL2_M + msr sctlr_el2, x2 + isb + + ERET +LEND(vmm_cleanup) From a745cdc19b7f92b490f7c332abad82945f3b06cb Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:37 +0100 Subject: [PATCH 046/145] arm64/vmm: Teach the vtimer about VHE Teach the virtual timer about the cnthctl_el2 field layout under VHE. As with non-VHE we need to trap the physical timer and not trap the virtual timer. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46074 --- sys/arm64/include/hypervisor.h | 4 ++++ sys/arm64/vmm/io/vtimer.c | 38 +++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index 011f86e83fdf..4c501e2722a9 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -41,6 +41,10 @@ #define CNTHCTL_EL1PCTEN (1 << 0) /* Allow physical counter access */ #define CNTHCTL_EL1PCEN (1 << 1) /* Allow physical timer access */ /* Valid if HCR_EL2.E2H == 1 */ +#define CNTHCTL_E2H_EL0PCTEN (1 << 0) /* Allow EL0 physical counter access */ +#define CNTHCTL_E2H_EL0VCTEN (1 << 1) /* Allow EL0 virtual counter access */ +#define CNTHCTL_E2H_EL0VTEN (1 << 8) +#define CNTHCTL_E2H_EL0PTEN (1 << 9) #define CNTHCTL_E2H_EL1PCTEN (1 << 10) /* Allow physical counter access */ #define CNTHCTL_E2H_EL1PTEN (1 << 11) /* Allow physical timer access */ /* Unconditionally valid */ diff --git a/sys/arm64/vmm/io/vtimer.c b/sys/arm64/vmm/io/vtimer.c index aa0b3ff1588e..f59d7ebc1ad4 100644 --- a/sys/arm64/vmm/io/vtimer.c +++ b/sys/arm64/vmm/io/vtimer.c @@ -129,14 +129,42 @@ vtimer_vminit(struct hyp *hyp) { uint64_t now; + hyp->vtimer.cnthctl_el2 = cnthctl_el2_reg; + /* * Configure the Counter-timer Hypervisor Control Register for the VM. 
- * - * CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 from EL1 - * CNTHCTL_EL1PCTEN: trap access to CNTPCT_EL0 */ - hyp->vtimer.cnthctl_el2 = cnthctl_el2_reg & ~CNTHCTL_EL1PCEN; - hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_EL1PCTEN; + if (in_vhe()) { + /* + * CNTHCTL_E2H_EL0PCTEN: trap EL0 access to CNTP{CT,CTSS}_EL0 + * CNTHCTL_E2H_EL0VCTEN: don't trap EL0 access to + * CNTV{CT,CTSS}_EL0 + * CNTHCTL_E2H_EL0VTEN: don't trap EL0 access to + * CNTV_{CTL,CVAL,TVAL}_EL0 + * CNTHCTL_E2H_EL0PTEN: trap EL0 access to + * CNTP_{CTL,CVAL,TVAL}_EL0 + * CNTHCTL_E2H_EL1PTEN: trap EL1 access to + * CNTP_{CTL,CVAL,TVAL}_EL0 + * CNTHCTL_E2H_EL1PCTEN: trap access to CNTPCT_EL0 + * + * TODO: Don't trap when FEAT_ECV is present + */ + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_E2H_EL0PCTEN; + hyp->vtimer.cnthctl_el2 |= CNTHCTL_E2H_EL0VCTEN; + hyp->vtimer.cnthctl_el2 |= CNTHCTL_E2H_EL0VTEN; + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_E2H_EL0PTEN; + + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_E2H_EL1PTEN; + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_E2H_EL1PCTEN; + } else { + /* + * CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 + * from EL1 + * CNTHCTL_EL1PCTEN: trap access to CNTPCT_EL0 + */ + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_EL1PCEN; + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_EL1PCTEN; + } now = READ_SPECIALREG(cntpct_el0); hyp->vtimer.cntvoff_el2 = now; From 55aa31480ced477610b7cb0a948af6e99fefe864 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:42 +0100 Subject: [PATCH 047/145] arm64/vmm: Create functions to call into EL2 These will become ifuncs to enable VHE in a later change.
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46075 --- sys/arm64/vmm/io/vgic_v3.c | 3 +- sys/arm64/vmm/vmm_arm64.c | 38 ++++----------------- sys/arm64/vmm/vmm_handlers.c | 65 ++++++++++++++++++++++++++++++++++++ sys/arm64/vmm/vmm_handlers.h | 48 ++++++++++++++++++++++++++ sys/conf/files.arm64 | 1 + sys/modules/vmm/Makefile | 1 + 6 files changed, 123 insertions(+), 33 deletions(-) create mode 100644 sys/arm64/vmm/vmm_handlers.c create mode 100644 sys/arm64/vmm/vmm_handlers.h diff --git a/sys/arm64/vmm/io/vgic_v3.c b/sys/arm64/vmm/io/vgic_v3.c index 7ed591c409ba..67afb3374815 100644 --- a/sys/arm64/vmm/io/vgic_v3.c +++ b/sys/arm64/vmm/io/vgic_v3.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "vgic.h" #include "vgic_v3.h" @@ -2252,7 +2253,7 @@ vgic_v3_init(device_t dev) uint64_t ich_vtr_el2; uint32_t pribits, prebits; - ich_vtr_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_ICH_VTR); + ich_vtr_el2 = vmm_read_reg(HYP_REG_ICH_VTR); /* TODO: These fields are common with the vgicv2 driver */ pribits = ICH_VTR_EL2_PRIBITS(ich_vtr_el2); diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c index e5eee47b405b..1b73ed019fad 100644 --- a/sys/arm64/vmm/vmm_arm64.c +++ b/sys/arm64/vmm/vmm_arm64.c @@ -65,6 +65,7 @@ #include "io/vgic.h" #include "io/vgic_v3.h" #include "io/vtimer.h" +#include "vmm_handlers.h" #include "vmm_stat.h" #define HANDLED 1 @@ -101,9 +102,6 @@ static vm_offset_t stack_hyp_va[MAXCPU]; static vmem_t *el2_mem_alloc; static void arm_setup_vectors(void *arg); -static void vmm_pmap_clean_stage2_tlbi(void); -static void vmm_pmap_invalidate_range(uint64_t, vm_offset_t, vm_offset_t, bool); -static void vmm_pmap_invalidate_all(uint64_t); DPCPU_DEFINE_STATIC(struct hypctx *, vcpu); @@ -235,7 +233,6 @@ vmmops_modinit(int ipinum) vm_paddr_t vmm_base; uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field; uint64_t cnthctl_el2; - register_t daif; int cpu, i; bool rv __diagused; @@ -291,9 +288,9 @@ 
vmmops_modinit(int ipinum) /* Set up the stage 2 pmap callbacks */ MPASS(pmap_clean_stage2_tlbi == NULL); - pmap_clean_stage2_tlbi = vmm_pmap_clean_stage2_tlbi; - pmap_stage2_invalidate_range = vmm_pmap_invalidate_range; - pmap_stage2_invalidate_all = vmm_pmap_invalidate_all; + pmap_clean_stage2_tlbi = vmm_clean_s2_tlbi; + pmap_stage2_invalidate_range = vmm_s2_tlbi_range; + pmap_stage2_invalidate_all = vmm_s2_tlbi_all; /* * Create an allocator for the virtual address space used by EL2. @@ -429,9 +426,7 @@ vmmops_modinit(int ipinum) vmem_add(el2_mem_alloc, next_hyp_va, HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK); - daif = intr_disable(); - cnthctl_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_CNTHCTL); - intr_restore(daif); + cnthctl_el2 = vmm_read_reg(HYP_REG_CNTHCTL); vgic_init(); vtimer_init(cnthctl_el2); @@ -567,26 +562,6 @@ vmmops_vmspace_free(struct vmspace *vmspace) vmspace_free(vmspace); } -static void -vmm_pmap_clean_stage2_tlbi(void) -{ - vmm_call_hyp(HYP_CLEAN_S2_TLBI); -} - -static void -vmm_pmap_invalidate_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, - bool final_only) -{ - MPASS(eva > sva); - vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, final_only); -} - -static void -vmm_pmap_invalidate_all(uint64_t vttbr) -{ - vmm_call_hyp(HYP_S2_TLBI_ALL, vttbr); -} - static inline void arm64_print_hyp_regs(struct vm_exit *vme) { @@ -1143,8 +1118,7 @@ vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo) vgic_flush_hwstate(hypctx); /* Call into EL2 to switch to the guest */ - excp_type = vmm_call_hyp(HYP_ENTER_GUEST, - hyp->el2_addr, hypctx->el2_addr); + excp_type = vmm_enter_guest(hyp, hypctx); vgic_sync_hwstate(hypctx); vtimer_sync_hwstate(hypctx); diff --git a/sys/arm64/vmm/vmm_handlers.c b/sys/arm64/vmm/vmm_handlers.c new file mode 100644 index 000000000000..2ce674d5ba46 --- /dev/null +++ b/sys/arm64/vmm/vmm_handlers.c @@ -0,0 +1,65 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Arm Ltd + * 
+ * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include + +#include + +#include "arm64.h" +#include "vmm_handlers.h" + +uint64_t +vmm_read_reg(uint64_t reg) +{ + return (vmm_call_hyp(HYP_READ_REGISTER, reg)); +} + +uint64_t +vmm_enter_guest(struct hyp *hyp, struct hypctx *hypctx) +{ + return (vmm_call_hyp(HYP_ENTER_GUEST, hyp->el2_addr, hypctx->el2_addr)); +} + +void +vmm_clean_s2_tlbi(void) +{ + vmm_call_hyp(HYP_CLEAN_S2_TLBI); +} + +void +vmm_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, + bool final_only) +{ + vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, final_only); +} + +void +vmm_s2_tlbi_all(uint64_t vttbr) +{ + vmm_call_hyp(HYP_S2_TLBI_ALL, vttbr); +} diff --git a/sys/arm64/vmm/vmm_handlers.h b/sys/arm64/vmm/vmm_handlers.h new file mode 100644 index 000000000000..f651fce6f32d --- /dev/null +++ b/sys/arm64/vmm/vmm_handlers.h @@ -0,0 +1,48 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_VMM_HANDLERS_H_ +#define _VMM_VMM_HANDLERS_H_ + +#include + +struct hyp; +struct hypctx; + +void vmm_clean_s2_tlbi(void); +uint64_t vmm_enter_guest(struct hyp *, struct hypctx *); +uint64_t vmm_read_reg(uint64_t); +void vmm_s2_tlbi_range(uint64_t, vm_offset_t, vm_offset_t, bool); +void vmm_s2_tlbi_all(uint64_t); + +void vmm_vhe_clean_s2_tlbi(void); +uint64_t vmm_vhe_enter_guest(struct hyp *, struct hypctx *); +uint64_t vmm_vhe_read_reg(uint64_t); +void vmm_vhe_s2_tlbi_range(uint64_t, vm_offset_t, vm_offset_t, bool); +void vmm_vhe_s2_tlbi_all(uint64_t); + +#endif /* _VMM_VMM_HANDLERS_H_ */ diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 10590d6c77e2..b522177221e5 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -122,6 +122,7 @@ arm64/vmm/vmm_instruction_emul.c optional vmm arm64/vmm/vmm_stat.c optional vmm arm64/vmm/vmm_arm64.c optional vmm arm64/vmm/vmm_reset.c optional vmm +arm64/vmm/vmm_handlers.c optional vmm arm64/vmm/vmm_call.S optional vmm arm64/vmm/vmm_nvhe_exception.S optional vmm \ compile-with "${NOSAN_C:N-mbranch-protection*} -fpie" \ diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index 409804f4e25c..a3a878a653ff 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -28,6 +28,7 @@ DPSRCS+= assym.inc SRCS+= vmm_arm64.c \ vmm_reset.c \ vmm_call.S \ + vmm_handlers.c \ vmm_mmu.c \ vmm_hyp_el2.S From 387f878aa7afdc48cdd304a9c2f5e6806639f6f0 Mon Sep 17 
00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:46 +0100 Subject: [PATCH 048/145] arm64/vmm: Teach vmm_arm.c about VHE Most of the code is identical however some, e.g. managing EL2 memory or setting EL2 registers, are unneeded under VHE as the kernel is in EL2 so can manage these directly. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46076 --- sys/arm64/vmm/vmm_arm64.c | 224 ++++++++++++++++++++------------------ sys/arm64/vmm/vmm_reset.c | 8 +- 2 files changed, 127 insertions(+), 105 deletions(-) diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c index 1b73ed019fad..3079353668e3 100644 --- a/sys/arm64/vmm/vmm_arm64.c +++ b/sys/arm64/vmm/vmm_arm64.c @@ -128,20 +128,6 @@ arm_setup_vectors(void *arg) el2_regs = arg; arm64_set_active_vcpu(NULL); - daif = intr_disable(); - - /* - * Install the temporary vectors which will be responsible for - * initializing the VMM when we next trap into EL2. - * - * x0: the exception vector table responsible for hypervisor - * initialization on the next call. - */ - vmm_call_hyp(vtophys(&vmm_hyp_code)); - - /* Create and map the hypervisor stack */ - stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE; - /* * Configure the system control register for EL2: * @@ -159,9 +145,27 @@ arm_setup_vectors(void *arg) sctlr_el2 |= SCTLR_EL2_WXN; sctlr_el2 &= ~SCTLR_EL2_EE; - /* Special call to initialize EL2 */ - vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2, - sctlr_el2, el2_regs->vtcr_el2); + daif = intr_disable(); + + if (in_vhe()) { + WRITE_SPECIALREG(vtcr_el2, el2_regs->vtcr_el2); + } else { + /* + * Install the temporary vectors which will be responsible for + * initializing the VMM when we next trap into EL2. + * + * x0: the exception vector table responsible for hypervisor + * initialization on the next call. 
+ */ + vmm_call_hyp(vtophys(&vmm_hyp_code)); + + /* Create and map the hypervisor stack */ + stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE; + + /* Special call to initialize EL2 */ + vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2, + sctlr_el2, el2_regs->vtcr_el2); + } intr_restore(daif); } @@ -280,10 +284,12 @@ vmmops_modinit(int ipinum) } pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT; - /* Initialise the EL2 MMU */ - if (!vmmpmap_init()) { - printf("vmm: Failed to init the EL2 MMU\n"); - return (ENOMEM); + if (!in_vhe()) { + /* Initialise the EL2 MMU */ + if (!vmmpmap_init()) { + printf("vmm: Failed to init the EL2 MMU\n"); + return (ENOMEM); + } } /* Set up the stage 2 pmap callbacks */ @@ -292,55 +298,58 @@ vmmops_modinit(int ipinum) pmap_stage2_invalidate_range = vmm_s2_tlbi_range; pmap_stage2_invalidate_all = vmm_s2_tlbi_all; - /* - * Create an allocator for the virtual address space used by EL2. - * EL2 code is identity-mapped; the allocator is used to find space for - * VM structures. - */ - el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0, M_WAITOK); - - /* Create the mappings for the hypervisor translation table. */ - hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code); - - /* We need an physical identity mapping for when we activate the MMU */ - hyp_code_base = vmm_base = vtophys(&vmm_hyp_code); - rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base, - VM_PROT_READ | VM_PROT_EXECUTE); - MPASS(rv); - - next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE); - - /* Create a per-CPU hypervisor stack */ - CPU_FOREACH(cpu) { - stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO); - stack_hyp_va[cpu] = next_hyp_va; - - for (i = 0; i < VMM_STACK_PAGES; i++) { - rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i), - PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)), - VM_PROT_READ | VM_PROT_WRITE); - MPASS(rv); + if (!in_vhe()) { + /* + * Create an allocator for the virtual address space used by + * EL2. 
EL2 code is identity-mapped; the allocator is used to + * find space for VM structures. + */ + el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0, + M_WAITOK); + + /* Create the mappings for the hypervisor translation table. */ + hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code); + + /* We need an physical identity mapping for when we activate the MMU */ + hyp_code_base = vmm_base = vtophys(&vmm_hyp_code); + rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base, + VM_PROT_READ | VM_PROT_EXECUTE); + MPASS(rv); + + next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE); + + /* Create a per-CPU hypervisor stack */ + CPU_FOREACH(cpu) { + stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO); + stack_hyp_va[cpu] = next_hyp_va; + + for (i = 0; i < VMM_STACK_PAGES; i++) { + rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i), + PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)), + VM_PROT_READ | VM_PROT_WRITE); + MPASS(rv); + } + next_hyp_va += L2_SIZE; } - next_hyp_va += L2_SIZE; - } - el2_regs.tcr_el2 = TCR_EL2_RES1; - el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT, - TCR_EL2_PS_52BITS); - el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS); - el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA; + el2_regs.tcr_el2 = TCR_EL2_RES1; + el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT, + TCR_EL2_PS_52BITS); + el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS); + el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA; #if PAGE_SIZE == PAGE_SIZE_4K - el2_regs.tcr_el2 |= TCR_EL2_TG0_4K; + el2_regs.tcr_el2 |= TCR_EL2_TG0_4K; #elif PAGE_SIZE == PAGE_SIZE_16K - el2_regs.tcr_el2 |= TCR_EL2_TG0_16K; + el2_regs.tcr_el2 |= TCR_EL2_TG0_16K; #else #error Unsupported page size #endif #ifdef SMP - el2_regs.tcr_el2 |= TCR_EL2_SH0_IS; + el2_regs.tcr_el2 |= TCR_EL2_SH0_IS; #endif + } - switch (el2_regs.tcr_el2 & TCR_EL2_PS_MASK) { + switch (pa_range_bits << TCR_EL2_PS_SHIFT) { case TCR_EL2_PS_32BITS: vmm_max_ipa_bits = 32; break; @@ -396,36 
+405,37 @@ vmmops_modinit(int ipinum) smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs); - /* Add memory to the vmem allocator (checking there is space) */ - if (vmm_base > (L2_SIZE + PAGE_SIZE)) { - /* - * Ensure there is an L2 block before the vmm code to check - * for buffer overflows on earlier data. Include the PAGE_SIZE - * of the minimum we can allocate. - */ - vmm_base -= L2_SIZE + PAGE_SIZE; - vmm_base = rounddown2(vmm_base, L2_SIZE); + if (!in_vhe()) { + /* Add memory to the vmem allocator (checking there is space) */ + if (vmm_base > (L2_SIZE + PAGE_SIZE)) { + /* + * Ensure there is an L2 block before the vmm code to check + * for buffer overflows on earlier data. Include the PAGE_SIZE + * of the minimum we can allocate. + */ + vmm_base -= L2_SIZE + PAGE_SIZE; + vmm_base = rounddown2(vmm_base, L2_SIZE); + + /* + * Check there is memory before the vmm code to add. + * + * Reserve the L2 block at address 0 so NULL dereference will + * raise an exception. + */ + if (vmm_base > L2_SIZE) + vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE, + M_WAITOK); + } /* - * Check there is memory before the vmm code to add. - * - * Reserve the L2 block at address 0 so NULL dereference will - * raise an exception. + * Add the memory after the stacks. There is most of an L2 block + * between the last stack and the first allocation so this should + * be safe without adding more padding. */ - if (vmm_base > L2_SIZE) - vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE, - M_WAITOK); + if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE) + vmem_add(el2_mem_alloc, next_hyp_va, + HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK); } - - /* - * Add the memory after the stacks. There is most of an L2 block - * between the last stack and the first allocation so this should - * be safe without adding more padding. 
- */ - if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE) - vmem_add(el2_mem_alloc, next_hyp_va, - HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK); - cnthctl_el2 = vmm_read_reg(HYP_REG_CNTHCTL); vgic_init(); @@ -439,21 +449,25 @@ vmmops_modcleanup(void) { int cpu; - smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL); + if (!in_vhe()) { + smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL); - CPU_FOREACH(cpu) { - vmmpmap_remove(stack_hyp_va[cpu], VMM_STACK_PAGES * PAGE_SIZE, - false); - } + CPU_FOREACH(cpu) { + vmmpmap_remove(stack_hyp_va[cpu], + VMM_STACK_PAGES * PAGE_SIZE, false); + } - vmmpmap_remove(hyp_code_base, hyp_code_len, false); + vmmpmap_remove(hyp_code_base, hyp_code_len, false); + } vtimer_cleanup(); - vmmpmap_fini(); + if (!in_vhe()) { + vmmpmap_fini(); - CPU_FOREACH(cpu) - free(stack[cpu], M_HYP); + CPU_FOREACH(cpu) + free(stack[cpu], M_HYP); + } pmap_clean_stage2_tlbi = NULL; pmap_stage2_invalidate_range = NULL; @@ -505,8 +519,9 @@ vmmops_init(struct vm *vm, pmap_t pmap) vtimer_vminit(hyp); vgic_vminit(hyp); - hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size, - VM_PROT_READ | VM_PROT_WRITE); + if (!in_vhe()) + hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size, + VM_PROT_READ | VM_PROT_WRITE); return (hyp); } @@ -534,8 +549,9 @@ vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid) vtimer_cpuinit(hypctx); vgic_cpuinit(hypctx); - hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size, - VM_PROT_READ | VM_PROT_WRITE); + if (!in_vhe()) + hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size, + VM_PROT_READ | VM_PROT_WRITE); return (hypctx); } @@ -1124,9 +1140,7 @@ vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo) vtimer_sync_hwstate(hypctx); /* - * Deactivate the stage2 pmap. vmm_pmap_clean_stage2_tlbi - * depends on this meaning we activate the VM before entering - * the vm again + * Deactivate the stage2 pmap. 
*/ PCPU_SET(curvmpmap, NULL); intr_restore(daif); @@ -1179,7 +1193,8 @@ vmmops_vcpu_cleanup(void *vcpui) vtimer_cpucleanup(hypctx); vgic_cpucleanup(hypctx); - vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true); + if (!in_vhe()) + vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true); free(hypctx, M_HYP); } @@ -1194,7 +1209,8 @@ vmmops_cleanup(void *vmi) smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp); - vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true); + if (!in_vhe()) + vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true); free(hyp, M_HYP); } diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c index a929a60c9474..3195bc10dedd 100644 --- a/sys/arm64/vmm/vmm_reset.c +++ b/sys/arm64/vmm/vmm_reset.c @@ -136,6 +136,9 @@ reset_vm_el2_regs(void *vcpu) */ el2ctx->hcr_el2 = HCR_RW | HCR_TID3 | HCR_TWI | HCR_BSU_IS | HCR_FB | HCR_AMO | HCR_IMO | HCR_FMO | HCR_SWIO | HCR_VM; + if (in_vhe()) { + el2ctx->hcr_el2 |= HCR_E2H; + } /* TODO: Trap all extensions we don't support */ el2ctx->mdcr_el2 = 0; @@ -166,7 +169,10 @@ reset_vm_el2_regs(void *vcpu) * Don't trap accesses to CPACR_EL1, trace, SVE, Advanced SIMD * and floating point functionality to EL2. */ - el2ctx->cptr_el2 = CPTR_RES1; + if (in_vhe()) + el2ctx->cptr_el2 = CPACR_FPEN_TRAP_NONE; + else + el2ctx->cptr_el2 = CPTR_RES1; /* * Disable interrupts in the guest. The guest OS will re-enable * them. From 6b17f49da0a2c4cba05e6a1994fb10d16f11fe0a Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:51 +0100 Subject: [PATCH 049/145] arm64/vmm: Restore hcr_el2 earlier It may cause fields in other registers to change meaning. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46077 --- sys/arm64/vmm/vmm_hyp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index 92f8cd25251d..9341f42daef6 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -262,6 +262,9 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) uint64_t dfr0; /* Restore the special registers */ + WRITE_SPECIALREG(hcr_el2, hypctx->hcr_el2); + isb(); + WRITE_SPECIALREG(elr_el1, hypctx->elr_el1); WRITE_SPECIALREG(sp_el0, hypctx->sp_el0); WRITE_SPECIALREG(tpidr_el0, hypctx->tpidr_el0); @@ -290,7 +293,6 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) WRITE_SPECIALREG(spsr_el1, hypctx->spsr_el1); WRITE_SPECIALREG(cptr_el2, hypctx->cptr_el2); - WRITE_SPECIALREG(hcr_el2, hypctx->hcr_el2); WRITE_SPECIALREG(vpidr_el2, hypctx->vpidr_el2); WRITE_SPECIALREG(vmpidr_el2, hypctx->vmpidr_el2); From 7861ecd18b6f15cfe37fc6f51f6ba89b92398648 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:43:59 +0100 Subject: [PATCH 050/145] arm64/vmm: Teach the switcher about new registers To support booting the kernel in EL2 some of the EL0 and EL1 registers are changed to point to an EL2 version. To get access to the EL0/EL1 version of these registers we need to use the new EL02 and EL12 registers, e.g. to access elr_el1 from the host we would use elr_el12. Add macros that can be different on VHE vs non-VHE code so these registers can be accessed in the correct way. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46078 --- sys/arm64/vmm/vmm_hyp.c | 89 ++++++++++++++++++++++------------------ sys/arm64/vmm/vmm_nvhe.c | 4 ++ 2 files changed, 54 insertions(+), 39 deletions(-) diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index 9341f42daef6..ae0c46331382 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -49,11 +49,12 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) /* Store the guest VFP registers */ if (guest) { /* Store the timer registers */ - hypctx->vtimer_cpu.cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); + hypctx->vtimer_cpu.cntkctl_el1 = + READ_SPECIALREG(EL1_REG(CNTKCTL)); hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 = - READ_SPECIALREG(cntv_cval_el0); + READ_SPECIALREG(EL0_REG(CNTV_CVAL)); hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0 = - READ_SPECIALREG(cntv_ctl_el0); + READ_SPECIALREG(EL0_REG(CNTV_CTL)); /* Store the GICv3 registers */ hypctx->vgic_v3_regs.ich_eisr_el2 = @@ -222,33 +223,37 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) } /* Store the guest special registers */ - hypctx->elr_el1 = READ_SPECIALREG(elr_el1); hypctx->sp_el0 = READ_SPECIALREG(sp_el0); hypctx->tpidr_el0 = READ_SPECIALREG(tpidr_el0); hypctx->tpidrro_el0 = READ_SPECIALREG(tpidrro_el0); hypctx->tpidr_el1 = READ_SPECIALREG(tpidr_el1); - hypctx->vbar_el1 = READ_SPECIALREG(vbar_el1); hypctx->actlr_el1 = READ_SPECIALREG(actlr_el1); - hypctx->afsr0_el1 = READ_SPECIALREG(afsr0_el1); - hypctx->afsr1_el1 = READ_SPECIALREG(afsr1_el1); - hypctx->amair_el1 = READ_SPECIALREG(amair_el1); - hypctx->contextidr_el1 = READ_SPECIALREG(contextidr_el1); - hypctx->cpacr_el1 = READ_SPECIALREG(cpacr_el1); hypctx->csselr_el1 = READ_SPECIALREG(csselr_el1); - hypctx->esr_el1 = READ_SPECIALREG(esr_el1); - hypctx->far_el1 = READ_SPECIALREG(far_el1); - hypctx->mair_el1 = READ_SPECIALREG(mair_el1); hypctx->mdccint_el1 = READ_SPECIALREG(mdccint_el1); hypctx->mdscr_el1 = 
READ_SPECIALREG(mdscr_el1); hypctx->par_el1 = READ_SPECIALREG(par_el1); - hypctx->sctlr_el1 = READ_SPECIALREG(sctlr_el1); - hypctx->spsr_el1 = READ_SPECIALREG(spsr_el1); - hypctx->tcr_el1 = READ_SPECIALREG(tcr_el1); - /* TODO: Support when this is not res0 */ - hypctx->tcr2_el1 = 0; - hypctx->ttbr0_el1 = READ_SPECIALREG(ttbr0_el1); - hypctx->ttbr1_el1 = READ_SPECIALREG(ttbr1_el1); + + if (guest_or_nonvhe(guest)) { + hypctx->elr_el1 = READ_SPECIALREG(EL1_REG(ELR)); + hypctx->vbar_el1 = READ_SPECIALREG(EL1_REG(VBAR)); + + hypctx->afsr0_el1 = READ_SPECIALREG(EL1_REG(AFSR0)); + hypctx->afsr1_el1 = READ_SPECIALREG(EL1_REG(AFSR1)); + hypctx->amair_el1 = READ_SPECIALREG(EL1_REG(AMAIR)); + hypctx->contextidr_el1 = READ_SPECIALREG(EL1_REG(CONTEXTIDR)); + hypctx->cpacr_el1 = READ_SPECIALREG(EL1_REG(CPACR)); + hypctx->esr_el1 = READ_SPECIALREG(EL1_REG(ESR)); + hypctx->far_el1 = READ_SPECIALREG(EL1_REG(FAR)); + hypctx->mair_el1 = READ_SPECIALREG(EL1_REG(MAIR)); + hypctx->sctlr_el1 = READ_SPECIALREG(EL1_REG(SCTLR)); + hypctx->spsr_el1 = READ_SPECIALREG(EL1_REG(SPSR)); + hypctx->tcr_el1 = READ_SPECIALREG(EL1_REG(TCR)); + /* TODO: Support when this is not res0 */ + hypctx->tcr2_el1 = 0; + hypctx->ttbr0_el1 = READ_SPECIALREG(EL1_REG(TTBR0)); + hypctx->ttbr1_el1 = READ_SPECIALREG(EL1_REG(TTBR1)); + } hypctx->cptr_el2 = READ_SPECIALREG(cptr_el2); hypctx->hcr_el2 = READ_SPECIALREG(hcr_el2); @@ -265,32 +270,37 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) WRITE_SPECIALREG(hcr_el2, hypctx->hcr_el2); isb(); - WRITE_SPECIALREG(elr_el1, hypctx->elr_el1); WRITE_SPECIALREG(sp_el0, hypctx->sp_el0); WRITE_SPECIALREG(tpidr_el0, hypctx->tpidr_el0); WRITE_SPECIALREG(tpidrro_el0, hypctx->tpidrro_el0); WRITE_SPECIALREG(tpidr_el1, hypctx->tpidr_el1); - WRITE_SPECIALREG(vbar_el1, hypctx->vbar_el1); WRITE_SPECIALREG(actlr_el1, hypctx->actlr_el1); - WRITE_SPECIALREG(afsr0_el1, hypctx->afsr0_el1); - WRITE_SPECIALREG(afsr1_el1, hypctx->afsr1_el1); - 
WRITE_SPECIALREG(amair_el1, hypctx->amair_el1); - WRITE_SPECIALREG(contextidr_el1, hypctx->contextidr_el1); - WRITE_SPECIALREG(cpacr_el1, hypctx->cpacr_el1); WRITE_SPECIALREG(csselr_el1, hypctx->csselr_el1); - WRITE_SPECIALREG(esr_el1, hypctx->esr_el1); - WRITE_SPECIALREG(far_el1, hypctx->far_el1); WRITE_SPECIALREG(mdccint_el1, hypctx->mdccint_el1); WRITE_SPECIALREG(mdscr_el1, hypctx->mdscr_el1); - WRITE_SPECIALREG(mair_el1, hypctx->mair_el1); WRITE_SPECIALREG(par_el1, hypctx->par_el1); - WRITE_SPECIALREG(sctlr_el1, hypctx->sctlr_el1); - WRITE_SPECIALREG(tcr_el1, hypctx->tcr_el1); - /* TODO: tcr2_el1 */ - WRITE_SPECIALREG(ttbr0_el1, hypctx->ttbr0_el1); - WRITE_SPECIALREG(ttbr1_el1, hypctx->ttbr1_el1); - WRITE_SPECIALREG(spsr_el1, hypctx->spsr_el1); + + if (guest_or_nonvhe(guest)) { + WRITE_SPECIALREG(EL1_REG(ELR), hypctx->elr_el1); + WRITE_SPECIALREG(EL1_REG(VBAR), hypctx->vbar_el1); + + WRITE_SPECIALREG(EL1_REG(AFSR0), hypctx->afsr0_el1); + WRITE_SPECIALREG(EL1_REG(AFSR1), hypctx->afsr1_el1); + WRITE_SPECIALREG(EL1_REG(AMAIR), hypctx->amair_el1); + WRITE_SPECIALREG(EL1_REG(CONTEXTIDR), hypctx->contextidr_el1); + WRITE_SPECIALREG(EL1_REG(CPACR), hypctx->cpacr_el1); + WRITE_SPECIALREG(EL1_REG(ESR), hypctx->esr_el1); + WRITE_SPECIALREG(EL1_REG(FAR), hypctx->far_el1); + WRITE_SPECIALREG(EL1_REG(MAIR), hypctx->mair_el1); // + + WRITE_SPECIALREG(EL1_REG(SCTLR), hypctx->sctlr_el1); + WRITE_SPECIALREG(EL1_REG(SPSR), hypctx->spsr_el1); + WRITE_SPECIALREG(EL1_REG(TCR), hypctx->tcr_el1); + /* TODO: tcr2_el1 */ + WRITE_SPECIALREG(EL1_REG(TTBR0), hypctx->ttbr0_el1); + WRITE_SPECIALREG(EL1_REG(TTBR1), hypctx->ttbr1_el1); + } WRITE_SPECIALREG(cptr_el2, hypctx->cptr_el2); WRITE_SPECIALREG(vpidr_el2, hypctx->vpidr_el2); @@ -413,10 +423,11 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) if (guest) { /* Load the timer registers */ - WRITE_SPECIALREG(cntkctl_el1, hypctx->vtimer_cpu.cntkctl_el1); - WRITE_SPECIALREG(cntv_cval_el0, + 
WRITE_SPECIALREG(EL1_REG(CNTKCTL), + hypctx->vtimer_cpu.cntkctl_el1); + WRITE_SPECIALREG(EL0_REG(CNTV_CVAL), hypctx->vtimer_cpu.virt_timer.cntx_cval_el0); - WRITE_SPECIALREG(cntv_ctl_el0, + WRITE_SPECIALREG(EL0_REG(CNTV_CTL), hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0); WRITE_SPECIALREG(cnthctl_el2, hyp->vtimer.cnthctl_el2); WRITE_SPECIALREG(cntvoff_el2, hyp->vtimer.cntvoff_el2); diff --git a/sys/arm64/vmm/vmm_nvhe.c b/sys/arm64/vmm/vmm_nvhe.c index 1127a6e37781..025b1308ce68 100644 --- a/sys/arm64/vmm/vmm_nvhe.c +++ b/sys/arm64/vmm/vmm_nvhe.c @@ -32,6 +32,10 @@ #define VMM_STATIC static #define VMM_HYP_FUNC(func) vmm_nvhe_ ## func +#define guest_or_nonvhe(guest) (true) +#define EL1_REG(reg) MRS_REG_ALT_NAME(reg ## _EL1) +#define EL0_REG(reg) MRS_REG_ALT_NAME(reg ## _EL0) + #include "vmm_hyp.c" uint64_t vmm_hyp_enter(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, From 20eabb33b4645bc088c40d6475d5fa628b39031f Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:44:05 +0100 Subject: [PATCH 051/145] arm64/vmm: Only store the guest par_el1 There is no need to store the host par_el1. We don't depend on it not changing across calls into a guest. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46079 --- sys/arm64/vmm/vmm_hyp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index ae0c46331382..9c84bb4b294e 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -220,6 +220,7 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) hypctx->tf.tf_spsr = READ_SPECIALREG(spsr_el2); if (guest) { hypctx->tf.tf_esr = READ_SPECIALREG(esr_el2); + hypctx->par_el1 = READ_SPECIALREG(par_el1); } /* Store the guest special registers */ @@ -232,7 +233,6 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) hypctx->csselr_el1 = READ_SPECIALREG(csselr_el1); hypctx->mdccint_el1 = READ_SPECIALREG(mdccint_el1); hypctx->mdscr_el1 = READ_SPECIALREG(mdscr_el1); - hypctx->par_el1 = READ_SPECIALREG(par_el1); if (guest_or_nonvhe(guest)) { hypctx->elr_el1 = READ_SPECIALREG(EL1_REG(ELR)); @@ -279,7 +279,6 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) WRITE_SPECIALREG(csselr_el1, hypctx->csselr_el1); WRITE_SPECIALREG(mdccint_el1, hypctx->mdccint_el1); WRITE_SPECIALREG(mdscr_el1, hypctx->mdscr_el1); - WRITE_SPECIALREG(par_el1, hypctx->par_el1); if (guest_or_nonvhe(guest)) { WRITE_SPECIALREG(EL1_REG(ELR), hypctx->elr_el1); @@ -302,6 +301,10 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) WRITE_SPECIALREG(EL1_REG(TTBR1), hypctx->ttbr1_el1); } + if (guest) { + WRITE_SPECIALREG(par_el1, hypctx->par_el1); + } + WRITE_SPECIALREG(cptr_el2, hypctx->cptr_el2); WRITE_SPECIALREG(vpidr_el2, hypctx->vpidr_el2); WRITE_SPECIALREG(vmpidr_el2, hypctx->vmpidr_el2); From 5577bb2f67ff442a92a3c0edb133e03bc28bb9d6 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:44:11 +0100 Subject: [PATCH 052/145] arm64/vmm: Support tlbi from VHE When invalidating the stage 2 TLB we need to ensure page tables updates have completed and for tlbi 
vmalle1is the HCR_EL2 TGE flag needs to be clear. To fix the former add a data barrier before the tlbi instructions. On non-VHE this will happen as part of the exception entry, so is only needed for VHE. The tlbi vmalle1is instruction operates on the EL2 & 0 regime when HCR_EL2 E2H and TGE flags are both set. By clearing the TGE flag it will stop this and operate on the EL1 & 0 regime we are expecting. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46080 --- sys/arm64/vmm/vmm_hyp.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index 9c84bb4b294e..bd119c80139b 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -619,6 +619,13 @@ VMM_HYP_FUNC(s2_tlbi_range)(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, { uint64_t end, r, start; uint64_t host_vttbr; +#ifdef VMM_VHE + uint64_t host_hcr; +#endif + +#ifdef VMM_VHE + dsb(ishst); +#endif #define TLBI_VA_SHIFT 12 #define TLBI_VA_MASK ((1ul << 44) - 1) @@ -631,6 +638,12 @@ VMM_HYP_FUNC(s2_tlbi_range)(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, WRITE_SPECIALREG(vttbr_el2, vttbr); isb(); +#ifdef VMM_VHE + host_hcr = READ_SPECIALREG(hcr_el2); + WRITE_SPECIALREG(hcr_el2, host_hcr & ~HCR_TGE); + isb(); +#endif + /* * The CPU can cache the stage 1 + 2 combination so we need to ensure * the stage 2 is invalidated first, then when this has completed we @@ -655,7 +668,12 @@ VMM_HYP_FUNC(s2_tlbi_range)(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, dsb(ish); isb(); - /* Switch back t othe host vttbr */ +#ifdef VMM_VHE + WRITE_SPECIALREG(hcr_el2, host_hcr); + isb(); +#endif + + /* Switch back to the host vttbr */ WRITE_SPECIALREG(vttbr_el2, host_vttbr); isb(); } @@ -665,6 +683,10 @@ VMM_HYP_FUNC(s2_tlbi_all)(uint64_t vttbr) { uint64_t host_vttbr; +#ifdef VMM_VHE + dsb(ishst); +#endif + /* Switch to the guest vttbr */ /* TODO: Handle Cortex-A57/A72 erratum 131936 */ host_vttbr = 
READ_SPECIALREG(vttbr_el2); From f83a0f38d14e26f5f7f31050786e9190cd131902 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:44:18 +0100 Subject: [PATCH 053/145] arm64/vmm: Update exception vectors around a guest When entering into a guest with VHE we need to switch from the kernel exception vectors to the vmm exception vectors. The latter understands an exception will be from a guest and can switch back to a kernel context. Rather than encoding the location of the kernel vectors we can just read the value from vbar_el2 and restore it later. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46081 --- sys/arm64/vmm/vmm_hyp_exception.S | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S index 9a1648130f49..cd2b94f1ff0b 100644 --- a/sys/arm64/vmm/vmm_hyp_exception.S +++ b/sys/arm64/vmm/vmm_hyp_exception.S @@ -175,6 +175,10 @@ hyp_vectors: /* Restore host VTTBR */ mov x9, #VTTBR_HOST msr vttbr_el2, x9 + +#ifdef VMM_VHE + msr vbar_el1, x1 +#endif .endm @@ -288,6 +292,14 @@ LEND(handle_el2_el1_error64) * x0 - hypctx address */ ENTRY(VMM_HYP_FUNC(do_call_guest)) +#ifdef VMM_VHE + mrs x1, vbar_el1 + adrp x2, hyp_vectors + add x2, x2, :lo12:hyp_vectors + msr vbar_el1, x2 + isb +#endif + /* Save hypctx address */ msr tpidr_el2, x0 From 79a19e0e8d8d1fa71092e542671213bfe99b024e Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:44:24 +0100 Subject: [PATCH 054/145] arm64/vmm: Hide non-VHE exception code from VHE Remove the non-VHE exception code from the VHE code path. As we replace the exception vectors when entering the guest we don't need to check which context we are in so can skip parts of the exception vectors. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46082 --- sys/arm64/vmm/vmm_hyp_exception.S | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S index cd2b94f1ff0b..95b0e20f37ed 100644 --- a/sys/arm64/vmm/vmm_hyp_exception.S +++ b/sys/arm64/vmm/vmm_hyp_exception.S @@ -183,6 +183,7 @@ hyp_vectors: .macro handle_el2_excp type +#ifndef VMM_VHE /* Save registers before modifying so we can restore them */ str x9, [sp, #-16]! @@ -193,15 +194,18 @@ hyp_vectors: /* We got the exception while the guest was running */ ldr x9, [sp], #16 +#endif /* !VMM_VHE */ do_world_switch_to_host mov x0, \type ret +#ifndef VMM_VHE 1: /* We got the exception while the host was running */ ldr x9, [sp], #16 mov x0, \type ERET +#endif /* !VMM_VHE */ .endm @@ -223,6 +227,7 @@ LEND(handle_el2_el2h_error) LENTRY(handle_el2_el1_sync64) +#ifndef VMM_VHE /* Save registers before modifying so we can restore them */ str x9, [sp, #-16]! @@ -245,7 +250,9 @@ LENTRY(handle_el2_el1_sync64) ldr lr, [sp], #16 ERET -1: /* Guest exception taken to EL2 */ +1: +#endif + /* Guest exception taken to EL2 */ do_world_switch_to_host mov x0, #EXCP_TYPE_EL1_SYNC ret From b87952e837ec1e42a573557a8e938693d8952705 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 14:28:39 +0100 Subject: [PATCH 055/145] arm64/vmm: Mark asm files as supporting BTI These already support BTI as they use the ENTRY macro. While the non-VHE code doesn't need this the new VHE code will need it as it is linked into either the kernel or the vmm module so will be included in the BTI check. 
Sponsored by: Arm Ltd --- sys/arm64/vmm/vmm_hyp_exception.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S index 95b0e20f37ed..50c2490f37bf 100644 --- a/sys/arm64/vmm/vmm_hyp_exception.S +++ b/sys/arm64/vmm/vmm_hyp_exception.S @@ -30,6 +30,7 @@ */ +#include #include #include @@ -316,3 +317,5 @@ ENTRY(VMM_HYP_FUNC(do_call_guest)) /* Enter guest */ ERET END(VMM_HYP_FUNC(do_call_guest)) + +GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL) From bbe97db3c211bd0a89a5b9c1a58857625b763475 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:44:31 +0100 Subject: [PATCH 056/145] arm64/vmm: Add the VHE exception and switcher files These just need to include the common code with macros to ensure it is built correctly. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46083 --- sys/arm64/vmm/vmm_vhe.c | 39 +++++++++++++++++++++++++++++++ sys/arm64/vmm/vmm_vhe_exception.S | 31 ++++++++++++++++++++++++ sys/conf/files.arm64 | 2 ++ sys/modules/vmm/Makefile | 2 ++ 4 files changed, 74 insertions(+) create mode 100644 sys/arm64/vmm/vmm_vhe.c create mode 100644 sys/arm64/vmm/vmm_vhe_exception.S diff --git a/sys/arm64/vmm/vmm_vhe.c b/sys/arm64/vmm/vmm_vhe.c new file mode 100644 index 000000000000..8a12852e2a7a --- /dev/null +++ b/sys/arm64/vmm/vmm_vhe.c @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "vmm_handlers.h" + +#define VMM_VHE + +#define VMM_STATIC +#define VMM_HYP_FUNC(func) vmm_vhe_ ## func + +#define guest_or_nonvhe(guest) (guest) +#define EL1_REG(reg) MRS_REG_ALT_NAME(reg ## _EL12) +#define EL0_REG(reg) MRS_REG_ALT_NAME(reg ## _EL02) + +#include "vmm_hyp.c" diff --git a/sys/arm64/vmm/vmm_vhe_exception.S b/sys/arm64/vmm/vmm_vhe_exception.S new file mode 100644 index 000000000000..286f5df03707 --- /dev/null +++ b/sys/arm64/vmm/vmm_vhe_exception.S @@ -0,0 +1,31 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#define VMM_VHE +#define VMM_HYP_FUNC(func) vmm_vhe_ ## func + +#include "vmm_hyp_exception.S" diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index b522177221e5..b105ce873d24 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -145,6 +145,8 @@ vmm_hyp_blob.bin optional vmm \ arm64/vmm/vmm_hyp_el2.S optional vmm \ dependency vmm_hyp_blob.bin arm64/vmm/vmm_mmu.c optional vmm +arm64/vmm/vmm_vhe.c optional vmm +arm64/vmm/vmm_vhe_exception.S optional vmm arm64/vmm/io/vgic.c optional vmm arm64/vmm/io/vgic_v3.c optional vmm arm64/vmm/io/vgic_if.m optional vmm diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index a3a878a653ff..0604a34690e5 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -30,6 +30,8 @@ SRCS+= vmm_arm64.c \ vmm_call.S \ vmm_handlers.c \ vmm_mmu.c \ + vmm_vhe_exception.S \ + vmm_vhe.c \ vmm_hyp_el2.S .PATH: ${SRCTOP}/sys/${MACHINE}/vmm/io From 7279fa6af13d1c370f9bcb293562c694090d849d Mon Sep 17 00:00:00 2001 
From: Andrew Turner Date: Mon, 19 Aug 2024 13:45:54 +0100 Subject: [PATCH 057/145] arm64/vmm: Convert the handlers into ifuncs Now we have support for both VHE and non-VHE update the handlers to use an ifunc to decide which version to use. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46084 --- sys/arm64/vmm/vmm_handlers.c | 68 ++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/sys/arm64/vmm/vmm_handlers.c b/sys/arm64/vmm/vmm_handlers.c index 2ce674d5ba46..c567b585eb06 100644 --- a/sys/arm64/vmm/vmm_handlers.c +++ b/sys/arm64/vmm/vmm_handlers.c @@ -33,33 +33,81 @@ #include "arm64.h" #include "vmm_handlers.h" -uint64_t -vmm_read_reg(uint64_t reg) +/* Read an EL2 register */ +static uint64_t +vmm_nvhe_read_reg(uint64_t reg) { return (vmm_call_hyp(HYP_READ_REGISTER, reg)); } -uint64_t -vmm_enter_guest(struct hyp *hyp, struct hypctx *hypctx) +DEFINE_IFUNC(, uint64_t, vmm_read_reg, (uint64_t reg)) +{ + if (in_vhe()) + return (vmm_vhe_read_reg); + return (vmm_nvhe_read_reg); +} + +/* Enter the guest */ +static uint64_t +vmm_nvhe_enter_guest(struct hyp *hyp, struct hypctx *hypctx) { return (vmm_call_hyp(HYP_ENTER_GUEST, hyp->el2_addr, hypctx->el2_addr)); } -void -vmm_clean_s2_tlbi(void) +DEFINE_IFUNC(, uint64_t, vmm_enter_guest, + (struct hyp *hyp, struct hypctx *hypctx)) +{ + if (in_vhe()) + return (vmm_vhe_enter_guest); + return (vmm_nvhe_enter_guest); +} + +/* Clean the TLB for all guests */ +static void +vmm_nvhe_clean_s2_tlbi(void) { vmm_call_hyp(HYP_CLEAN_S2_TLBI); } -void -vmm_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, +DEFINE_IFUNC(, void, vmm_clean_s2_tlbi, (void)) +{ + if (in_vhe()) + return (vmm_vhe_clean_s2_tlbi); + return (vmm_nvhe_clean_s2_tlbi); +} + +/* + * Switch to a guest vttbr and clean the TLB for a range of guest + * virtual address space. 
+ */ +static void +vmm_nvhe_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, bool final_only) { vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, final_only); } -void -vmm_s2_tlbi_all(uint64_t vttbr) +DEFINE_IFUNC(, void, vmm_s2_tlbi_range, + (uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, bool final_only)) +{ + if (in_vhe()) + return (vmm_vhe_s2_tlbi_range); + return (vmm_nvhe_s2_tlbi_range); +} + +/* + * Switch to a guest vttbr and clean the TLB for all the guest + * virtual address space. + */ +static void +vmm_nvhe_s2_tlbi_all(uint64_t vttbr) { vmm_call_hyp(HYP_S2_TLBI_ALL, vttbr); } + +DEFINE_IFUNC(, void, vmm_s2_tlbi_all, (uint64_t vttbr)) +{ + if (in_vhe()) + return (vmm_vhe_s2_tlbi_all); + return (vmm_nvhe_s2_tlbi_all); +} From 53b8812c9a997ef7e8879484890ef688bca4acdf Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:46:00 +0100 Subject: [PATCH 058/145] arm64/vmm: Allow vmm when in VHE We now support VHE in bhyve so there is no reason to block it. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46085 --- sys/arm64/vmm/vmm_arm64.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c index 3079353668e3..164ff65cfe2c 100644 --- a/sys/arm64/vmm/vmm_arm64.c +++ b/sys/arm64/vmm/vmm_arm64.c @@ -246,12 +246,6 @@ vmmops_modinit(int ipinum) return (ENXIO); } - /* TODO: Support VHE */ - if (in_vhe()) { - printf("vmm: VHE is unsupported\n"); - return (ENXIO); - } - if (!vgic_present()) { printf("vmm: No vgic found\n"); return (ENODEV); From 19fb9ad746517c7af9d79a982334b2550f285355 Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Tue, 20 Aug 2024 12:04:01 +0200 Subject: [PATCH 059/145] nuageinit: readd ssh key parsing when key is in meta_data.json in openstack when no user is specified but a sshkey is provided the information is stored in meta_data.json under "public_keys" PR: 280461 Reported by: tdb --- libexec/nuageinit/nuageinit | 6 ++++ libexec/nuageinit/tests/nuageinit.sh | 48 ++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/libexec/nuageinit/nuageinit b/libexec/nuageinit/nuageinit index f268f9b0f52c..622e294bb531 100755 --- a/libexec/nuageinit/nuageinit +++ b/libexec/nuageinit/nuageinit @@ -164,6 +164,12 @@ if citype == "config-2" then nuage.err("nuageinit: error parsing config-2: meta_data.json: " .. 
err) end local obj = parser:get_object() + if obj.public_keys then + local homedir = nuage.adduser(default_user) + for _,v in pairs(obj.public_keys) do + nuage.addsshkey(homedir, v) + end + end nuage.sethostname(obj["hostname"]) -- network diff --git a/libexec/nuageinit/tests/nuageinit.sh b/libexec/nuageinit/tests/nuageinit.sh index c6a86bc15486..b5078e256853 100644 --- a/libexec/nuageinit/tests/nuageinit.sh +++ b/libexec/nuageinit/tests/nuageinit.sh @@ -8,6 +8,7 @@ atf_test_case nocloud_network atf_test_case config2 atf_test_case config2_pubkeys atf_test_case config2_pubkeys_user_data +atf_test_case config2_pubkeys_meta_data atf_test_case config2_network atf_test_case config2_network_static_v4 @@ -242,6 +243,52 @@ EOF atf_check -o inline:"ssh-rsa AAAAB3NzaC1y...== Generated by Nova\n" cat home/freebsd/.ssh/authorized_keys } +config2_pubkeys_meta_data_body() +{ + here=$(pwd) + export NUAGE_FAKE_ROOTDIR=$(pwd) + if [ $(id -u) -ne 0 ]; then + atf_skip "root required" + fi + mkdir -p media/nuageinit + cat > media/nuageinit/meta_data.json < etc/master.passwd < etc/group < Date: Tue, 20 Aug 2024 12:05:25 +0200 Subject: [PATCH 060/145] nuageinit: improve debugging when mkdir fails --- libexec/nuageinit/nuage.lua | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libexec/nuageinit/nuage.lua b/libexec/nuageinit/nuage.lua index cca1fe9b4678..116ab143ccfa 100644 --- a/libexec/nuageinit/nuage.lua +++ b/libexec/nuageinit/nuage.lua @@ -188,10 +188,7 @@ local function addsshkey(homedir, key) chownak = true dirattrs = lfs.attributes(dotssh_path) if dirattrs == nil then - if not lfs.mkdir(dotssh_path) then - warnmsg("nuageinit: impossible to create ".. 
dotssh_path) - return - end + assert(lfs.mkdir(dotssh_path)) chowndotssh = true dirattrs = lfs.attributes(homedir) end From 0a5996443b61861d6658ac216699b6717f05930d Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 20 Aug 2024 07:54:12 -0400 Subject: [PATCH 061/145] src.conf.5: Fix spelling typo Sponsored by: AFRL, DARPA --- share/man/man5/src.conf.5 | 4 ++-- tools/build/options/WITH_UNDEFINED_VERSION | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/share/man/man5/src.conf.5 b/share/man/man5/src.conf.5 index 200ce8211e17..afdf3ee51619 100644 --- a/share/man/man5/src.conf.5 +++ b/share/man/man5/src.conf.5 @@ -1,5 +1,5 @@ .\" DO NOT EDIT-- this file is @generated by tools/build/options/makeman. -.Dd August 6, 2024 +.Dd August 20, 2024 .Dt SRC.CONF 5 .Os .Sh NAME @@ -1759,7 +1759,7 @@ and related programs. .It Va WITH_UNDEFINED_VERSION Link libraries with --undefined-version which permits version maps to contain symbols that are not present in the library. -If this is necessicary to build a particular configuration, a bug is +If this is necessary to build a particular configuration, a bug is present and the configuration should be reported. .It Va WITHOUT_UNIFIED_OBJDIR Use the historical object directory format for diff --git a/tools/build/options/WITH_UNDEFINED_VERSION b/tools/build/options/WITH_UNDEFINED_VERSION index 71b048349a6f..99f687d37bcf 100644 --- a/tools/build/options/WITH_UNDEFINED_VERSION +++ b/tools/build/options/WITH_UNDEFINED_VERSION @@ -1,4 +1,4 @@ Link libraries with --undefined-version which permits version maps to contain symbols that are not present in the library. -If this is necessicary to build a particular configuration, a bug is +If this is necessary to build a particular configuration, a bug is present and the configuration should be reported. 
From 7d12558904ca5b6f830a4ba9f7b1f8ba119bc5e3 Mon Sep 17 00:00:00 2001 From: Igor Ostapenko Date: Tue, 20 Aug 2024 12:02:56 +0200 Subject: [PATCH 062/145] pf: Make pf_test predict that m_len < sizeof(struct ip) is false Reviewed by: kp Differential Revision: https://reviews.freebsd.org/D46374 --- sys/netpfil/pf/pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 9b1601ac0ee5..ad2dc2e707ed 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -8354,7 +8354,7 @@ pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, pd.af = AF_INET; pd.act.rtableid = -1; - if (m->m_len < sizeof(struct ip) && + if (__predict_false(m->m_len < sizeof(struct ip)) && (m = *m0 = m_pullup(*m0, sizeof(struct ip))) == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test: m_len < sizeof(struct ip), pullup failed\n")); From 2787f8c39c60f7510f9bc04ec46fe0b37ece0e3e Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Thu, 1 Aug 2024 01:13:33 +0300 Subject: [PATCH 063/145] mlx5en: stop including mlx5_accel/ipsec.h from en.h This creates a circular dependency preventing inline functions from ipsec.h from using en.h definitions. 
Sponsored by: NVidia networking --- sys/dev/mlx5/mlx5_en/en.h | 1 - sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c | 1 + sys/dev/mlx5/mlx5_en/mlx5_en_main.c | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h index cdc8caa838d6..8966aeacb890 100644 --- a/sys/dev/mlx5/mlx5_en/en.h +++ b/sys/dev/mlx5/mlx5_en/en.h @@ -70,7 +70,6 @@ #include #include #include -#include #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c index 1601557e52cc..ac275b5b145c 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c @@ -27,6 +27,7 @@ #include "opt_ratelimit.h" #include +#include #include #include diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c index a80235f0f347..acb9f72b15c3 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -30,6 +30,7 @@ #include "opt_ratelimit.h" #include +#include #include #include From d00f3505efad7c323f74bee63f7d2527daf0f534 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Thu, 1 Aug 2024 01:23:09 +0300 Subject: [PATCH 064/145] mlx5en: do not waste ipsec_accel_in_tag on non-ipsec packets rx Do not prepend ipsec tags into mbuf head when preparing rx wqe, store it separately. Only prepend (and clear the store) when received packet was indeed offloaded by ipsec engine. Then we do not need to refill tags for slots that received non-ipsec packets. This should solve some minimal degradation of the rx CPU usage due to unneeded tag allocation for each packet. 
Sponsored by: NVidia networking --- sys/dev/mlx5/mlx5_accel/ipsec.h | 23 ++++++------- sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c | 39 +++++++++++++---------- sys/dev/mlx5/mlx5_en/en.h | 3 ++ sys/dev/mlx5/mlx5_en/mlx5_en_main.c | 2 ++ sys/dev/mlx5/mlx5_en/mlx5_en_rx.c | 12 +++---- 5 files changed, 42 insertions(+), 37 deletions(-) diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h index 1658542fc9c6..2abd68d9770a 100644 --- a/sys/dev/mlx5/mlx5_accel/ipsec.h +++ b/sys/dev/mlx5/mlx5_accel/ipsec.h @@ -251,27 +251,22 @@ void mlx5e_accel_ipsec_fs_rx_tables_destroy(struct mlx5e_priv *priv); int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv); void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv); int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv); -int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mbuf *mb); -int mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe); +int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr); +void mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, + struct mlx5e_rq_mbuf *mr); + static inline int mlx5e_accel_ipsec_flow(struct mlx5_cqe64 *cqe) { return MLX5_IPSEC_METADATA_MARKER(be32_to_cpu(cqe->ft_metadata)); } -static inline void mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe) +static inline void +mlx5e_accel_ipsec_handle_rx(struct mbuf *mb, struct mlx5_cqe64 *cqe, + struct mlx5e_rq_mbuf *mr) { u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata); - if (!MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data)) { - struct m_tag *mtag; - - mtag = m_tag_find(mb, PACKET_TAG_IPSEC_ACCEL_IN, NULL); - if (mtag != NULL) - m_tag_delete(mb, mtag); - - return; - } - - mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe); + if (MLX5_IPSEC_METADATA_MARKER(ipsec_meta_data)) + mlx5e_accel_ipsec_handle_rx_cqe(mb, cqe, mr); } #endif /* __MLX5_ACCEL_IPSEC_H__ */ diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c 
b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c index 5ff8e021b196..0883cfb2d510 100644 --- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c +++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_rxtx.c @@ -35,36 +35,41 @@ #define MLX5_IPSEC_METADATA_HANDLE(ipsec_metadata) (ipsec_metadata & 0xFFFFFF) -int mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mbuf *mb) +int +mlx5_accel_ipsec_rx_tag_add(if_t ifp, struct mlx5e_rq_mbuf *mr) { struct mlx5e_priv *priv; - struct ipsec_accel_in_tag *tag; - struct m_tag *mtag; + struct ipsec_accel_in_tag *mtag; priv = if_getsoftc(ifp); if (priv->ipsec == NULL) return (0); + if (mr->ipsec_mtag != NULL) + return (0); - mtag = m_tag_get(PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*tag), M_NOWAIT); + mtag = (struct ipsec_accel_in_tag *)m_tag_get( + PACKET_TAG_IPSEC_ACCEL_IN, sizeof(*mtag), M_NOWAIT); if (mtag == NULL) - return -ENOMEM; - - m_tag_prepend(mb, mtag); - return 0; + return (-ENOMEM); + mr->ipsec_mtag = mtag; + return (0); } -int mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe) +void +mlx5e_accel_ipsec_handle_rx_cqe(struct mbuf *mb, struct mlx5_cqe64 *cqe, + struct mlx5e_rq_mbuf *mr) { - struct ipsec_accel_in_tag *tag; - u32 drv_spi; + struct ipsec_accel_in_tag *mtag; + u32 drv_spi; drv_spi = MLX5_IPSEC_METADATA_HANDLE(be32_to_cpu(cqe->ft_metadata)); - tag = (struct ipsec_accel_in_tag *) m_tag_find(mb, PACKET_TAG_IPSEC_ACCEL_IN, NULL); - WARN_ON(tag == NULL); - if (tag) - tag->drv_spi = drv_spi; - - return 0; + mtag = mr->ipsec_mtag; + WARN_ON(mtag == NULL); + mr->ipsec_mtag = NULL; + if (mtag != NULL) { + mtag->drv_spi = drv_spi; + m_tag_prepend(mb, &mtag->tag); + } } void diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h index 8966aeacb890..80e0b7fbdedb 100644 --- a/sys/dev/mlx5/mlx5_en/en.h +++ b/sys/dev/mlx5/mlx5_en/en.h @@ -747,10 +747,13 @@ struct mlx5e_cq { struct mlx5_wq_ctrl wq_ctrl; } __aligned(MLX5E_CACHELINE_SIZE); +struct ipsec_accel_in_tag; + struct mlx5e_rq_mbuf { bus_dmamap_t dma_map; caddr_t data; 
struct mbuf *mbuf; + struct ipsec_accel_in_tag *ipsec_mtag; }; struct mlx5e_rq { diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c index acb9f72b15c3..5081c1a0b782 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -1325,6 +1325,8 @@ mlx5e_destroy_rq(struct mlx5e_rq *rq) wq_sz = mlx5_wq_ll_get_size(&rq->wq); for (i = 0; i != wq_sz; i++) { if (rq->mbuf[i].mbuf != NULL) { + if (rq->mbuf[i].ipsec_mtag != NULL) + m_tag_free(&rq->mbuf[i].ipsec_mtag->tag); bus_dmamap_unload(rq->dma_tag, rq->mbuf[i].dma_map); m_freem(rq->mbuf[i].mbuf); } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c index 3d4b75884354..a24bbe3d193e 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c @@ -70,7 +70,7 @@ mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, /* get IP header aligned */ m_adj(mb, MLX5E_NET_IP_ALIGN); - err = mlx5_accel_ipsec_rx_tag_add(rq->ifp, mb); + err = mlx5_accel_ipsec_rx_tag_add(rq->ifp, &rq->mbuf[ix]); if (err) goto err_free_mbuf; err = -bus_dmamap_load_mbuf_sg(rq->dma_tag, rq->mbuf[ix].dma_map, @@ -277,9 +277,8 @@ mlx5e_mbuf_tstmp(struct mlx5e_priv *priv, uint64_t hw_tstmp) } static inline void -mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, - struct mlx5e_rq *rq, struct mbuf *mb, - u32 cqe_bcnt) +mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, + struct mbuf *mb, struct mlx5e_rq_mbuf *mr, u32 cqe_bcnt) { if_t ifp = rq->ifp; struct mlx5e_channel *c; @@ -423,7 +422,7 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, break; } - mlx5e_accel_ipsec_handle_rx(mb, cqe); + mlx5e_accel_ipsec_handle_rx(mb, cqe, mr); } static inline void @@ -588,7 +587,8 @@ mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget) rq->mbuf[wqe_counter].dma_map); } rx_common: - mlx5e_build_rx_mbuf(cqe, rq, mb, byte_cnt); + mlx5e_build_rx_mbuf(cqe, rq, mb, &rq->mbuf[wqe_counter], + byte_cnt); rq->stats.bytes += byte_cnt; rq->stats.packets++; #ifdef NUMA From 
828da10bb3c338b5964e120dd970e649730b7f4f Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 13 Aug 2024 11:38:01 +0300 Subject: [PATCH 065/145] mlx5en: fix destroying tx sa_entry when installing rx sa_entry failed In particular, do not cancel freed linux delayed work. Sponsored by: NVidia networking --- sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c index 555847717779..01d1cb28f86d 100644 --- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c +++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c @@ -45,6 +45,8 @@ #define MLX5_IPSEC_RESCHED msecs_to_jiffies(1000) +static void mlx5e_if_sa_deinstall_onekey(struct ifnet *ifp, u_int dev_spi, + void *priv); static int mlx5e_if_sa_deinstall(struct ifnet *ifp, u_int dev_spi, void *priv); static struct mlx5e_ipsec_sa_entry *to_ipsec_sa_entry(void *x) @@ -378,6 +380,7 @@ mlx5e_if_sa_newkey_onedir(struct ifnet *ifp, void *sav, int dir, mlx5_ipsec_free_sa_ctx(sa_entry); err_sa_ctx: kfree(sa_entry->dwork); + sa_entry->dwork = NULL; err_xfrm: kfree(sa_entry); mlx5_en_err(ifp, "Device failed to offload this state"); @@ -403,7 +406,9 @@ mlx5e_if_sa_newkey(struct ifnet *ifp, void *sav, u_int dev_spi, void **privp) if (error == 0) { *privp = pb; } else { - mlx5e_if_sa_deinstall(ifp, dev_spi, pb->priv_in); + if (pb->priv_in->dwork != NULL) + cancel_delayed_work_sync(&pb->priv_in->dwork->dwork); + mlx5e_if_sa_deinstall_onekey(ifp, dev_spi, pb->priv_in); free(pb, M_DEVBUF); } return (error); From c4a0ee9b97bfc7407366567aaa2c09313b3e6bd2 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Thu, 8 Aug 2024 14:43:28 +0300 Subject: [PATCH 066/145] ipsec_offload: add handler for interface down events Remove all offloaded SAs and SPs on ifdown. 
Sponsored by: NVIDIA networking --- sys/netipsec/ipsec_offload.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c index 984134539d8b..4d81803f4be7 100644 --- a/sys/netipsec/ipsec_offload.c +++ b/sys/netipsec/ipsec_offload.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -138,6 +139,8 @@ PCTRIE_DEFINE(DRVSPI_SA, ifp_handle_sav, drv_spi, drvspi_sa_trie_alloc, drvspi_sa_trie_free); static struct pctrie drv_spi_pctrie; +static eventhandler_tag ipsec_accel_ifdetach_event_tag; + static void ipsec_accel_sa_newkey_impl(struct secasvar *sav); static int ipsec_accel_handle_sav(struct secasvar *sav, struct ifnet *ifp, u_int drv_spi, void *priv, uint32_t flags, struct ifp_handle_sav **ires); @@ -154,6 +157,7 @@ static struct mbuf *ipsec_accel_key_setaccelif_impl(struct secasvar *sav); static void ipsec_accel_on_ifdown_impl(struct ifnet *ifp); static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); +static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp); static void ipsec_accel_init(void *arg) @@ -174,6 +178,9 @@ ipsec_accel_init(void *arg) ipsec_accel_drv_sa_lifetime_update_p = ipsec_accel_drv_sa_lifetime_update_impl; pctrie_init(&drv_spi_pctrie); + ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER( + ifnet_departure_event, ipsec_accel_ifdetach_event, NULL, + EVENTHANDLER_PRI_ANY); } SYSINIT(ipsec_accel_init, SI_SUB_VNET_DONE, SI_ORDER_ANY, ipsec_accel_init, NULL); @@ -181,6 +188,8 @@ SYSINIT(ipsec_accel_init, SI_SUB_VNET_DONE, SI_ORDER_ANY, static void ipsec_accel_fini(void *arg) { + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + ipsec_accel_ifdetach_event_tag); ipsec_accel_sa_newkey_p = NULL; ipsec_accel_forget_sav_p = NULL; ipsec_accel_spdadd_p = NULL; @@ -799,6 +808,14 @@ ipsec_accel_on_ifdown_impl(struct ifnet *ifp) ipsec_accel_on_ifdown_sav(ifp); } +static 
void +ipsec_accel_ifdetach_event(void *arg __unused, struct ifnet *ifp) +{ + if ((ifp->if_flags & IFF_RENAMING) != 0) + return; + ipsec_accel_on_ifdown_impl(ifp); +} + static bool ipsec_accel_output_pad(struct mbuf *m, struct secasvar *sav, int skip, int mtu) { From 4f4c34e9d6e1d496ac2c00d490c4218049375b4c Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 14 Aug 2024 06:06:55 +0300 Subject: [PATCH 067/145] if_vlan.c: remove stray include of sys/cdefs.h Sponsored by: NVidia networking --- sys/net/if_vlan.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index e2b32ac2e7de..4349a6c99291 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -42,7 +42,6 @@ * use by the real outgoing interface, and ask it to send them. */ -#include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" From 84abf7e26d1981a26cc3cd1842d5cefeb2253754 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 17 Jul 2024 08:55:56 +0300 Subject: [PATCH 068/145] ipsec_offload: support vlans Sponsored by: NVIDIA networking --- sys/net/if_vlan.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 4349a6c99291..45489138fdef 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -44,6 +44,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_vlan.h" #include "opt_ratelimit.h" @@ -184,6 +185,7 @@ struct ifvlan { void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; + int ifv_capenable2; int ifv_encaplen; /* encapsulation length */ int ifv_mtufudge; /* MTU fudged by this much */ int ifv_mintu; /* min transmission unit */ @@ -1750,6 +1752,7 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid, ifv->ifv_mintu = ETHERMIN; ifv->ifv_pflags = 0; ifv->ifv_capenable = -1; + ifv->ifv_capenable2 = -1; /* * If the 
parent supports the VLAN_MTU capability, @@ -2008,13 +2011,90 @@ vlan_link_state(struct ifnet *ifp) NET_EPOCH_EXIT(et); } +#ifdef IPSEC_OFFLOAD +#define VLAN_IPSEC_METHOD(exp) \ + if_t p; \ + struct ifvlan *ifv; \ + int error; \ + \ + ifv = ifp->if_softc; \ + VLAN_SLOCK(); \ + if (TRUNK(ifv) != NULL) { \ + p = PARENT(ifv); \ + if_ref(p); \ + error = p->if_ipsec_accel_m->exp; \ + if_rele(p); \ + } else { \ + error = ENXIO; \ + } \ + VLAN_SUNLOCK(); \ + return (error); + + +static int +vlan_if_spdadd(if_t ifp, void *sp, void *inp, void **priv) +{ + VLAN_IPSEC_METHOD(if_spdadd(ifp, sp, inp, priv)); +} + +static int +vlan_if_spddel(if_t ifp, void *sp, void *priv) +{ + VLAN_IPSEC_METHOD(if_spddel(ifp, sp, priv)); +} + +static int +vlan_if_sa_newkey(if_t ifp, void *sav, u_int drv_spi, void **privp) +{ + VLAN_IPSEC_METHOD(if_sa_newkey(ifp, sav, drv_spi, privp)); +} + +static int +vlan_if_sa_deinstall(if_t ifp, u_int drv_spi, void *priv) +{ + VLAN_IPSEC_METHOD(if_sa_deinstall(ifp, drv_spi, priv)); +} + +static int +vlan_if_sa_cnt(if_t ifp, void *sa, uint32_t drv_spi, void *priv, + struct seclifetime *lt) +{ + VLAN_IPSEC_METHOD(if_sa_cnt(ifp, sa, drv_spi, priv, lt)); +} + +static int +vlan_if_ipsec_hwassist(if_t ifp, void *sav, u_int drv_spi,void *priv) +{ + if_t trunk; + + NET_EPOCH_ASSERT(); + trunk = vlan_trunkdev(ifp); + if (trunk == NULL) + return (0); + return (trunk->if_ipsec_accel_m->if_hwassist(trunk, sav, + drv_spi, priv)); +} + +static const struct if_ipsec_accel_methods vlan_if_ipsec_accel_methods = { + .if_spdadd = vlan_if_spdadd, + .if_spddel = vlan_if_spddel, + .if_sa_newkey = vlan_if_sa_newkey, + .if_sa_deinstall = vlan_if_sa_deinstall, + .if_sa_cnt = vlan_if_sa_cnt, + .if_hwassist = vlan_if_ipsec_hwassist, +}; + +#undef VLAN_IPSEC_METHOD +#endif /* IPSEC_OFFLOAD */ + static void vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p; struct ifnet *ifp; struct ifnet_hw_tsomax hw_tsomax; - int cap = 0, ena = 0, mena; + int cap = 0, ena = 0, mena, cap2 = 0, 
ena2 = 0; + int mena2 __unused; u_long hwa = 0; NET_EPOCH_ASSERT(); @@ -2025,6 +2105,7 @@ vlan_capabilities(struct ifvlan *ifv) /* Mask parent interface enabled capabilities disabled by user. */ mena = p->if_capenable & ifv->ifv_capenable; + mena2 = p->if_capenable2 & ifv->ifv_capenable2; /* * If the parent interface can do checksum offloading @@ -2129,7 +2210,15 @@ vlan_capabilities(struct ifvlan *ifv) ifp->if_capabilities = cap; ifp->if_capenable = ena; + ifp->if_capabilities2 = cap2; + ifp->if_capenable2 = ena2; ifp->if_hwassist = hwa; + +#ifdef IPSEC_OFFLOAD + cap2 |= p->if_capabilities2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD); + ena2 |= mena2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD); + ifp->if_ipsec_accel_m = &vlan_if_ipsec_accel_methods; +#endif } static void From 205263ac250aadd84931d2b77475bc931c3afeff Mon Sep 17 00:00:00 2001 From: Ariel Ehrenberg Date: Tue, 30 Jul 2024 19:50:40 +0300 Subject: [PATCH 069/145] mlx5en: support ipsec offload on vlan if Add vlan tag match to RX FS SA and policy rules and report SA lifetime counter on vlan interface in case SA was installed on vlan interface Existing code didn't have the net tag id as part of the FS matching rules. This can cause applying ipsec offload to the wrong interface. 
This commit adds the tag id as part of the FS matchers and treats tag value 0 as no tag Sponsored by: NVidia networking --- sys/dev/mlx5/mlx5_accel/ipsec.h | 6 + sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c | 90 ++++++++++---- sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c | 157 ++++++++++++++++++------ 3 files changed, 194 insertions(+), 59 deletions(-) diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h index 2abd68d9770a..95742c4099f1 100644 --- a/sys/dev/mlx5/mlx5_accel/ipsec.h +++ b/sys/dev/mlx5/mlx5_accel/ipsec.h @@ -37,6 +37,8 @@ #define MLX5E_IPSEC_SADB_RX_BITS 10 #define MLX5_IPSEC_METADATA_MARKER(ipsec_metadata) ((ipsec_metadata >> 31) & 0x1) +#define VLAN_NONE 0xfff + struct mlx5e_priv; struct mlx5e_tx_wqe; struct mlx5e_ipsec_tx; @@ -135,6 +137,7 @@ struct mlx5e_ipsec_rule { struct mlx5_flow_handle *rule; struct mlx5_flow_handle *kspi_rule; struct mlx5_flow_handle *reqid_rule; + struct mlx5_flow_handle *vid_zero_rule; struct mlx5_modify_hdr *modify_hdr; struct mlx5_pkt_reformat *pkt_reformat; struct mlx5_fc *fc; @@ -149,6 +152,7 @@ struct mlx5e_ipsec_esn_state { struct mlx5e_ipsec_sa_entry { struct secasvar *savp; if_t ifp; + if_t ifpo; struct mlx5e_ipsec *ipsec; struct mlx5_accel_esp_xfrm_attrs attrs; struct mlx5e_ipsec_rule ipsec_rule; @@ -158,6 +162,7 @@ struct mlx5e_ipsec_sa_entry { u32 enc_key_id; u16 kspi; /* Stack allocated unique SA identifier */ struct mlx5e_ipsec_esn_state esn_state; + u16 vid; }; struct upspec { @@ -184,6 +189,7 @@ struct mlx5_accel_pol_xfrm_attrs { u8 dir : 2; u32 reqid; u32 prio; + u16 vid; }; struct mlx5e_ipsec_pol_entry { diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c index 01d1cb28f86d..a25ed4c1c51f 100644 --- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c +++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec.c @@ -97,8 +97,8 @@ mlx5e_ipsec_handle_counters(struct work_struct *_work) bytes += bytes1; #ifdef IPSEC_OFFLOAD - ipsec_accel_drv_sa_lifetime_update(sa_entry->savp, sa_entry->ifp, - 
sa_entry->kspi, bytes, packets); + ipsec_accel_drv_sa_lifetime_update( + sa_entry->savp, sa_entry->ifpo, sa_entry->kspi, bytes, packets); #endif queue_delayed_work(sa_entry->ipsec->wq, &dwork->dwork, @@ -321,19 +321,23 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev, } static int -mlx5e_if_sa_newkey_onedir(struct ifnet *ifp, void *sav, int dir, - u_int drv_spi, struct mlx5e_ipsec_sa_entry **privp, - struct mlx5e_ipsec_priv_bothdir *pb) +mlx5e_if_sa_newkey_onedir(struct ifnet *ifp, void *sav, int dir, u_int drv_spi, + struct mlx5e_ipsec_sa_entry **privp, struct mlx5e_ipsec_priv_bothdir *pb, + struct ifnet *ifpo) { struct mlx5e_ipsec_sa_entry *sa_entry = NULL; struct mlx5e_priv *priv = if_getsoftc(ifp); struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_ipsec *ipsec = priv->ipsec; + u16 vid = VLAN_NONE; int err; if (priv->gone != 0 || ipsec == NULL) return (EOPNOTSUPP); + if (if_gettype(ifpo) == IFT_L2VLAN) + VLAN_TAG(ifpo, &vid); + err = mlx5e_xfrm_validate_state(mdev, sav); if (err) return err; @@ -345,7 +349,9 @@ mlx5e_if_sa_newkey_onedir(struct ifnet *ifp, void *sav, int dir, sa_entry->kspi = drv_spi; sa_entry->savp = sav; sa_entry->ifp = ifp; + sa_entry->ifpo = ifpo; sa_entry->ipsec = ipsec; + sa_entry->vid = vid; mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &sa_entry->attrs, dir); @@ -387,22 +393,35 @@ mlx5e_if_sa_newkey_onedir(struct ifnet *ifp, void *sav, int dir, return err; } +#define GET_TRUNK_IF(vifp, ifp, ept) \ + if (if_gettype(vifp) == IFT_L2VLAN) { \ + NET_EPOCH_ENTER(ept); \ + ifp = VLAN_TRUNKDEV(vifp); \ + NET_EPOCH_EXIT(ept); \ + } else { \ + ifp = vifp; \ + } + static int -mlx5e_if_sa_newkey(struct ifnet *ifp, void *sav, u_int dev_spi, void **privp) +mlx5e_if_sa_newkey(struct ifnet *ifpo, void *sav, u_int dev_spi, void **privp) { struct mlx5e_ipsec_priv_bothdir *pb; + struct epoch_tracker et; + struct ifnet *ifp; int error; + GET_TRUNK_IF(ifpo, ifp, et); + pb = malloc(sizeof(struct mlx5e_ipsec_priv_bothdir), M_DEVBUF, 
M_WAITOK | M_ZERO); - error = mlx5e_if_sa_newkey_onedir(ifp, sav, IPSEC_DIR_INBOUND, - dev_spi, &pb->priv_in, pb); + error = mlx5e_if_sa_newkey_onedir( + ifp, sav, IPSEC_DIR_INBOUND, dev_spi, &pb->priv_in, pb, ifpo); if (error != 0) { free(pb, M_DEVBUF); return (error); } - error = mlx5e_if_sa_newkey_onedir(ifp, sav, IPSEC_DIR_OUTBOUND, - dev_spi, &pb->priv_out, pb); + error = mlx5e_if_sa_newkey_onedir( + ifp, sav, IPSEC_DIR_OUTBOUND, dev_spi, &pb->priv_out, pb, ifpo); if (error == 0) { *privp = pb; } else { @@ -431,9 +450,13 @@ mlx5e_if_sa_deinstall_onekey(struct ifnet *ifp, u_int dev_spi, void *priv) } static int -mlx5e_if_sa_deinstall(struct ifnet *ifp, u_int dev_spi, void *priv) +mlx5e_if_sa_deinstall(struct ifnet *ifpo, u_int dev_spi, void *priv) { struct mlx5e_ipsec_priv_bothdir pb, *pbp; + struct epoch_tracker et; + struct ifnet *ifp; + + GET_TRUNK_IF(ifpo, ifp, et); pbp = priv; pb = *(struct mlx5e_ipsec_priv_bothdir *)priv; @@ -462,12 +485,16 @@ mlx5e_if_sa_cnt_one(struct ifnet *ifp, void *sa, uint32_t drv_spi, } static int -mlx5e_if_sa_cnt(struct ifnet *ifp, void *sa, uint32_t drv_spi, - void *priv, struct seclifetime *lt) +mlx5e_if_sa_cnt(struct ifnet *ifpo, void *sa, uint32_t drv_spi, void *priv, + struct seclifetime *lt) { struct mlx5e_ipsec_priv_bothdir *pb; u64 packets_in, packets_out; u64 bytes_in, bytes_out; + struct epoch_tracker et; + struct ifnet *ifp; + + GET_TRUNK_IF(ifpo, ifp, et); pb = priv; mlx5e_if_sa_cnt_one(ifp, sa, drv_spi, pb->priv_in, @@ -546,9 +573,9 @@ static int mlx5e_xfrm_validate_policy(struct mlx5_core_dev *mdev, return 0; } -static void mlx5e_ipsec_build_accel_pol_attrs(struct mlx5e_ipsec_pol_entry *pol_entry, - struct mlx5_accel_pol_xfrm_attrs *attrs, - struct inpcb *inp) +static void +mlx5e_ipsec_build_accel_pol_attrs(struct mlx5e_ipsec_pol_entry *pol_entry, + struct mlx5_accel_pol_xfrm_attrs *attrs, struct inpcb *inp, u16 vid) { struct secpolicy *sp = pol_entry->sp; struct secpolicyindex *spidx = &sp->spidx; @@ -592,15 
+619,22 @@ static void mlx5e_ipsec_build_accel_pol_attrs(struct mlx5e_ipsec_pol_entry *pol_ attrs->action = IPSEC_POLICY_IPSEC; } attrs->dir = spidx->dir; + attrs->vid = vid; } -static int mlx5e_if_spd_install(struct ifnet *ifp, void *sp, void *inp1, - void **ifdatap) +static int +mlx5e_if_spd_install(struct ifnet *ifpo, void *sp, void *inp1, void **ifdatap) { struct mlx5e_ipsec_pol_entry *pol_entry; struct mlx5e_priv *priv; + struct epoch_tracker et; + u16 vid = VLAN_NONE; + struct ifnet *ifp; int err; + GET_TRUNK_IF(ifpo, ifp, et); + if (if_gettype(ifpo) == IFT_L2VLAN) + VLAN_TAG(ifpo, &vid); priv = if_getsoftc(ifp); if (priv->gone || !priv->ipsec) return (EOPNOTSUPP); @@ -616,7 +650,8 @@ static int mlx5e_if_spd_install(struct ifnet *ifp, void *sp, void *inp1, pol_entry->sp = sp; pol_entry->ipsec = priv->ipsec; - mlx5e_ipsec_build_accel_pol_attrs(pol_entry, &pol_entry->attrs, inp1); + mlx5e_ipsec_build_accel_pol_attrs(pol_entry, &pol_entry->attrs, + inp1, vid); err = mlx5e_accel_ipsec_fs_add_pol(pol_entry); if (err) goto err_pol; @@ -630,11 +665,12 @@ static int mlx5e_if_spd_install(struct ifnet *ifp, void *sp, void *inp1, return err; } - -static int mlx5e_if_spd_deinstall(struct ifnet *ifp, void *sp, void *ifdata) +static int +mlx5e_if_spd_deinstall(struct ifnet *ifpo, void *sp, void *ifdata) { - struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(ifdata); + struct mlx5e_ipsec_pol_entry *pol_entry; + pol_entry = to_ipsec_pol_entry(ifdata); mlx5e_accel_ipsec_fs_del_pol(pol_entry); kfree(pol_entry); return 0; @@ -654,9 +690,17 @@ void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv) } static int -mlx5e_if_ipsec_hwassist(if_t ifnet, void *sav __unused, +mlx5e_if_ipsec_hwassist(if_t ifneto, void *sav __unused, uint32_t drv_spi __unused, void *priv __unused) { + if_t ifnet; + + if (if_gettype(ifneto) == IFT_L2VLAN) { + ifnet = VLAN_TRUNKDEV(ifneto); + } else { + ifnet = ifneto; + } + return (if_gethwassist(ifnet) & (CSUM_TSO | CSUM_TCP | CSUM_UDP | CSUM_IP | 
CSUM_IP6_TSO | CSUM_IP6_TCP | CSUM_IP6_UDP)); } diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c index a6a0398f9dca..e348ab1992a5 100644 --- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c +++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c @@ -185,6 +185,44 @@ static void setup_fte_spi(struct mlx5_flow_spec *spec, u32 spi, bool encap) } } +static void +setup_fte_vid(struct mlx5_flow_spec *spec, u16 vid) +{ + /* virtual lan tag */ + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.cvlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, + vid); +} + +static void +clear_fte_vid(struct mlx5_flow_spec *spec) +{ + MLX5_SET(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag, 0); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.cvlan_tag, 0); + MLX5_SET(fte_match_param, spec->match_criteria, + outer_headers.first_vid, 0); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.first_vid, 0); +} + +static void +setup_fte_no_vid(struct mlx5_flow_spec *spec) +{ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.cvlan_tag, 0); +} + static struct mlx5_fs_chains * ipsec_chains_create(struct mlx5_core_dev *mdev, struct mlx5_flow_table *miss_ft, enum mlx5_flow_namespace_type ns, int base_prio, @@ -474,17 +512,6 @@ static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) if (!spec) return -ENOMEM; - if (attrs->family == AF_INET) - setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4); - else - setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6); - - if (!attrs->encap) - setup_fte_esp(spec); - - setup_fte_spi(spec, attrs->spi, 
attrs->encap); - setup_fte_no_frags(spec); - if (!attrs->drop) { err = setup_modify_header(mdev, sa_entry->kspi | BIT(31), IPSEC_DIR_INBOUND, &flow_act); @@ -520,15 +547,46 @@ static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; dest[1].counter_id = mlx5_fc_id(counter); + if (attrs->family == AF_INET) + setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4); + else + setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6); + + if (!attrs->encap) + setup_fte_esp(spec); + + setup_fte_spi(spec, attrs->spi, attrs->encap); + setup_fte_no_frags(spec); + + if (sa_entry->vid != VLAN_NONE) + setup_fte_vid(spec, sa_entry->vid); + else + setup_fte_no_vid(spec); + rule = mlx5_add_flow_rules(rx->ft.sa, spec, &flow_act, dest, 2); if (IS_ERR(rule)) { err = PTR_ERR(rule); mlx5_core_err(mdev, "fail to add RX ipsec rule err=%d\n", err); goto err_add_flow; } + ipsec_rule->rule = rule; + + /* Add another rule for zero vid */ + if (sa_entry->vid == VLAN_NONE) { + clear_fte_vid(spec); + setup_fte_vid(spec, 0); + rule = mlx5_add_flow_rules(rx->ft.sa, spec, &flow_act, dest, 2); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, + "fail to add RX ipsec zero vid rule err=%d\n", + err); + goto err_add_flow; + } + ipsec_rule->vid_zero_rule = rule; + } kvfree(spec); - ipsec_rule->rule = rule; ipsec_rule->fc = counter; ipsec_rule->modify_hdr = flow_act.modify_hdr; ipsec_rule->pkt_reformat = flow_act.pkt_reformat; @@ -536,10 +594,12 @@ static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) err_add_flow: mlx5_fc_destroy(mdev, counter); + if (ipsec_rule->rule != NULL) + mlx5_del_flow_rules(&ipsec_rule->rule); err_add_cnt: mlx5_packet_reformat_dealloc(mdev, flow_act.pkt_reformat); err_pkt_reformat: - if (flow_act.modify_hdr) + if (flow_act.modify_hdr != NULL) mlx5_modify_header_dealloc(mdev, flow_act.modify_hdr); err_mod_header: kvfree(spec); @@ -1222,8 +1282,6 @@ static int tx_add_policy(struct 
mlx5e_ipsec_pol_entry *pol_entry) switch (attrs->action) { case IPSEC_POLICY_IPSEC: flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - /*if (!attrs->reqid) - break;*/ err = setup_modify_header(mdev, attrs->reqid, IPSEC_DIR_OUTBOUND, &flow_act); if (err) @@ -1278,7 +1336,7 @@ static int rx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry) struct mlx5_flow_spec *spec; struct mlx5_flow_table *ft; struct mlx5e_ipsec_rx *rx; - int err, dstn = 0; + int err, dstn = 0; rx = (attrs->family == AF_INET) ? ipsec->rx_ipv4 : ipsec->rx_ipv6; ft = rx->chains ? ipsec_chains_get_table(rx->chains, attrs->prio) : rx->ft.pol; @@ -1291,14 +1349,6 @@ static int rx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry) goto err_alloc; } - if (attrs->family == AF_INET) - setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4); - else - setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6); - - setup_fte_no_frags(spec); - setup_fte_upper_proto_match(spec, &attrs->upspec); - switch (attrs->action) { case IPSEC_POLICY_IPSEC: flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; @@ -1318,21 +1368,52 @@ static int rx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry) dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest[dstn].ft = rx->ft.sa; dstn++; - rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, dstn); - if (IS_ERR(rule)) { - err = PTR_ERR(rule); - mlx5_core_err(mdev, "Fail to add RX IPsec policy rule err=%d\n", err); - goto err_action; - } - kvfree(spec); - pol_entry->ipsec_rule.rule = rule; + if (attrs->family == AF_INET) + setup_fte_addr4(spec, &attrs->saddr.a4, &attrs->daddr.a4); + else + setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6); + + setup_fte_no_frags(spec); + setup_fte_upper_proto_match(spec, &attrs->upspec); + if (attrs->vid != VLAN_NONE) + setup_fte_vid(spec, attrs->vid); + else + setup_fte_no_vid(spec); + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, dstn); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, + "Failed to 
add RX IPsec policy rule err=%d\n", err); + goto err_action; + } + pol_entry->ipsec_rule.rule = rule; + + /* Add also rule for zero vid */ + if (attrs->vid == VLAN_NONE) { + clear_fte_vid(spec); + setup_fte_vid(spec, 0); + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, dstn); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, + "Failed to add RX IPsec policy rule err=%d\n", + err); + goto err_action; + } + pol_entry->ipsec_rule.vid_zero_rule = rule; + } + + kvfree(spec); return 0; err_action: - kvfree(spec); + if (pol_entry->ipsec_rule.rule != NULL) + mlx5_del_flow_rules(&pol_entry->ipsec_rule.rule); + kvfree(spec); err_alloc: - if (rx->chains) + if (rx->chains != NULL) ipsec_chains_put_table(rx->chains, attrs->prio); return err; } @@ -1854,7 +1935,9 @@ void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_ipsec_sa_entry *sa_entry) mlx5_del_flow_rules(&ipsec_rule->rule); mlx5_del_flow_rules(&ipsec_rule->kspi_rule); - if (ipsec_rule->reqid_rule) + if (ipsec_rule->vid_zero_rule != NULL) + mlx5_del_flow_rules(&ipsec_rule->vid_zero_rule); + if (ipsec_rule->reqid_rule != NULL) mlx5_del_flow_rules(&ipsec_rule->reqid_rule); mlx5_fc_destroy(mdev, ipsec_rule->fc); mlx5_packet_reformat_dealloc(mdev, ipsec_rule->pkt_reformat); @@ -1863,7 +1946,7 @@ void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_ipsec_sa_entry *sa_entry) return; } - if (ipsec_rule->modify_hdr) + if (ipsec_rule->modify_hdr != NULL) mlx5_modify_header_dealloc(mdev, ipsec_rule->modify_hdr); } @@ -1881,6 +1964,8 @@ void mlx5e_accel_ipsec_fs_del_pol(struct mlx5e_ipsec_pol_entry *pol_entry) struct mlx5_core_dev *mdev = mlx5e_ipsec_pol2dev(pol_entry); mlx5_del_flow_rules(&ipsec_rule->rule); + if (ipsec_rule->vid_zero_rule != NULL) + mlx5_del_flow_rules(&ipsec_rule->vid_zero_rule); if (pol_entry->attrs.dir == IPSEC_DIR_INBOUND) { struct mlx5e_ipsec_rx *rx; From 66f0e2017f7cd804f31ae4fc2ad38607d85a08d3 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sun, 18 Aug 2024 16:22:28 +0300 Subject: 
[PATCH 070/145] ipsec_offload: add ipsec_accel_drv_sa_lifetime_fetch() A function to fetch hardware counters for offloaded SA on specific interface. Sponsored by: NVidia networking --- sys/netipsec/ipsec_offload.c | 29 +++++++++++++++++++++++++++++ sys/netipsec/ipsec_offload.h | 4 ++++ sys/netipsec/key.c | 15 +++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/sys/netipsec/ipsec_offload.c b/sys/netipsec/ipsec_offload.c index 4d81803f4be7..bbf98ac7a676 100644 --- a/sys/netipsec/ipsec_offload.c +++ b/sys/netipsec/ipsec_offload.c @@ -157,6 +157,8 @@ static struct mbuf *ipsec_accel_key_setaccelif_impl(struct secasvar *sav); static void ipsec_accel_on_ifdown_impl(struct ifnet *ifp); static void ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); +static int ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav, + if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); static void ipsec_accel_ifdetach_event(void *arg, struct ifnet *ifp); static void @@ -177,6 +179,8 @@ ipsec_accel_init(void *arg) ipsec_accel_on_ifdown_p = ipsec_accel_on_ifdown_impl; ipsec_accel_drv_sa_lifetime_update_p = ipsec_accel_drv_sa_lifetime_update_impl; + ipsec_accel_drv_sa_lifetime_fetch_p = + ipsec_accel_drv_sa_lifetime_fetch_impl; pctrie_init(&drv_spi_pctrie); ipsec_accel_ifdetach_event_tag = EVENTHANDLER_REGISTER( ifnet_departure_event, ipsec_accel_ifdetach_event, NULL, @@ -200,6 +204,7 @@ ipsec_accel_fini(void *arg) ipsec_accel_key_setaccelif_p = NULL; ipsec_accel_on_ifdown_p = NULL; ipsec_accel_drv_sa_lifetime_update_p = NULL; + ipsec_accel_drv_sa_lifetime_fetch_p = NULL; ipsec_accel_sync_imp(); clean_unrhdr(drv_spi_unr); /* avoid panic, should go later */ clear_unrhdr(drv_spi_unr); @@ -1017,6 +1022,30 @@ ipsec_accel_drv_sa_lifetime_update_impl(struct secasvar *sav, if_t ifp, NET_EPOCH_EXIT(et); } +static int +ipsec_accel_drv_sa_lifetime_fetch_impl(struct secasvar *sav, + if_t ifp, u_int drv_spi, 
uint64_t *octets, uint64_t *allocs) +{ + struct ifp_handle_sav *i; + int error; + + NET_EPOCH_ASSERT(); + error = 0; + + mtx_lock(&ipsec_accel_cnt_lock); + CK_LIST_FOREACH(i, &sav->accel_ifps, sav_link) { + if (i->ifp == ifp && i->drv_spi == drv_spi) { + *octets = i->cnt_octets; + *allocs = i->cnt_allocs; + break; + } + } + if (i == NULL) + error = ENOENT; + mtx_unlock(&ipsec_accel_cnt_lock); + return (error); +} + static void ipsec_accel_sa_lifetime_hw(struct secasvar *sav, if_t ifp, struct seclifetime *lft) diff --git a/sys/netipsec/ipsec_offload.h b/sys/netipsec/ipsec_offload.h index 72055a110951..904fe6252396 100644 --- a/sys/netipsec/ipsec_offload.h +++ b/sys/netipsec/ipsec_offload.h @@ -64,6 +64,8 @@ extern struct mbuf *(*ipsec_accel_key_setaccelif_p)(struct secasvar *sav); extern void (*ipsec_accel_on_ifdown_p)(struct ifnet *ifp); extern void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); +extern int (*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, + if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); #ifdef IPSEC_OFFLOAD /* @@ -191,6 +193,8 @@ struct ipsec_accel_in_tag *ipsec_accel_input_tag_lookup(const struct mbuf *); void ipsec_accel_on_ifdown(struct ifnet *ifp); void ipsec_accel_drv_sa_lifetime_update(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); +int ipsec_accel_drv_sa_lifetime_fetch(struct secasvar *sav, + if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs); #endif /* _KERNEL */ diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index 149173e0b5f6..5a3e5727bc2e 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -112,6 +112,8 @@ struct mbuf *(*ipsec_accel_key_setaccelif_p)(struct secasvar *sav); void (*ipsec_accel_on_ifdown_p)(struct ifnet *ifp); void (*ipsec_accel_drv_sa_lifetime_update_p)(struct secasvar *sav, if_t ifp, u_int drv_spi, uint64_t octets, uint64_t allocs); +int 
(*ipsec_accel_drv_sa_lifetime_fetch_p)(struct secasvar *sav, if_t ifp, + u_int drv_spi, uint64_t *octets, uint64_t *allocs); #endif #define FULLMASK 0xff @@ -8990,4 +8992,17 @@ ipsec_accel_drv_sa_lifetime_update(struct secasvar *sav, if_t ifp, if (p != NULL) p(sav, ifp, drv_spi, octets, allocs); } + +int +ipsec_accel_drv_sa_lifetime_fetch(struct secasvar *sav, + if_t ifp, u_int drv_spi, uint64_t *octets, uint64_t *allocs) +{ + int (*p)(struct secasvar *sav, if_t ifp, u_int drv_spi, + uint64_t *octets, uint64_t *allocs); + + p = atomic_load_ptr(&ipsec_accel_drv_sa_lifetime_fetch_p); + if (p == NULL) + return (EOPNOTSUPP); + return (p(sav, ifp, drv_spi, octets, allocs)); +} #endif From e9ace6e8f8fb69cbc4973bc2d3ae1b4088cc49dd Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Thu, 15 Aug 2024 06:34:58 +0300 Subject: [PATCH 071/145] net/if_lagg.c: remove stray include of sys/cdefs.h Sponsored by: NVidia networking --- sys/net/if_lagg.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index 2f558fba32de..988f23fc029c 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -18,7 +18,6 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -#include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" From d1e78fbd4a67245e058e9d26b333216093f8d2d1 Mon Sep 17 00:00:00 2001 From: Wolfram Schneider Date: Tue, 20 Aug 2024 13:26:46 +0000 Subject: [PATCH 072/145] bsd-family-tree: add NetBSD 9.4 --- share/misc/bsd-family-tree | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/share/misc/bsd-family-tree b/share/misc/bsd-family-tree index 4f9ee8d438d8..5b5bf87557b8 100644 --- a/share/misc/bsd-family-tree +++ b/share/misc/bsd-family-tree @@ -438,23 +438,23 @@ FreeBSD 5.2 | | | | | 13.1 | | | | | | | | | | | | | DragonFly 6.2.2 | | | | | NetBSD 9.3 | | - | | | macOS | OpenBSD 7.2 | - | | | 13 | | | - | | FreeBSD | | | | - | | 12.4 | | | | - | | | | | DragonFly 6.4.0 - | | | | OpenBSD 7.3 | - | FreeBSD | | | | - | 13.2 | | | | - | | | | | | - | `------. | | | | - | | macOS | | | - | | 14 | | | - | | | | OpenBSD 7.4 | - *--FreeBSD | | | | | - | 14.0 | | | | | - | | | | | | | - | | FreeBSD | | | | + | | | macOS | | OpenBSD 7.2 | + | | | 13 | | | | + | | FreeBSD | | | | | + | | 12.4 | | | | | + | | | | | | DragonFly 6.4.0 + | | | | | OpenBSD 7.3 | + | FreeBSD | | | | | + | 13.2 | | | | | + | | | | | | | + | `------. | | | | | + | | macOS | | | | + | | 14 | | | | + | | | | | OpenBSD 7.4 | + *--FreeBSD | | | | | | + | 14.0 | | | | | | + | | | | | | | | + | | FreeBSD | | NetBSD 9.4 | | | | 13.3 | | | | | | | *--NetBSD | | | | | | 10.0 | | @@ -894,6 +894,7 @@ FreeBSD 14.0 2023-11-20 [FBD] FreeBSD 13.3 2024-03-05 [FBD] NetBSD 10.0 2024-03-28 [NBD] OpenBSD 7.5 2024-04-05 [OBD] +NetBSD 9.4 2024-04-20 [NBD] FreeBSD 14.1 2024-06-04 [FBD] Bibliography @@ -959,5 +960,5 @@ original BSD announcements from Usenet or tapes. Steven M. Schultz for providing 2.8BSD, 2.10BSD, 2.11BSD manual pages. 
-- -Copyright (c) 1997-2023 Wolfram Schneider +Copyright (c) 1997-2024 Wolfram Schneider URL: https://cgit.freebsd.org/src/tree/share/misc/bsd-family-tree From d4a4d1e742852b00427f723f59534b42718628de Mon Sep 17 00:00:00 2001 From: Wolfram Schneider Date: Tue, 20 Aug 2024 13:29:18 +0000 Subject: [PATCH 073/145] bsd-family-tree: shorter URLs --- share/misc/bsd-family-tree | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/share/misc/bsd-family-tree b/share/misc/bsd-family-tree index 5b5bf87557b8..c4a7790b773d 100644 --- a/share/misc/bsd-family-tree +++ b/share/misc/bsd-family-tree @@ -492,7 +492,7 @@ the announcement in Usenet or if it was available as tape. [QCU] Salus, Peter H. A quarter century of UNIX. ISBN 0201547775, EAN 9780201547771 [SMS] Steven M. Schultz. 2.11BSD, UNIX for the PDP-11. -[TUHS] The Unix Historical Society. https://minnie.tuhs.org/Unix_History/. +[TUHS] The Unix Historical Society. https://minnie.tuhs.org/Unix_History/ [USE] Usenet announcement. [WRS] Wind River Systems, Inc. [dmr] Dennis Ritchie, via E-Mail @@ -941,7 +941,7 @@ FreeBSD Documentation Archive URL: https://docs-archive.freebsd.org/doc/ UNIX history graphing project -URL: https://minnie.tuhs.org/Unix_History/index.html +URL: https://minnie.tuhs.org/Unix_History/ UNIX history URL: https://www.levenez.com/unix/ From e5b85380836378c9e321a4e6d300591e6faf622a Mon Sep 17 00:00:00 2001 From: Eugene Grosbein Date: Tue, 20 Aug 2024 21:00:35 +0700 Subject: [PATCH 074/145] libalias: add another check to previous change If UseLink() returns NULL, it is possible that Deletelink() has already freed "grp", so check it out carefully. 
PR: 269770 Reported by: Peter Much X-MFC-With: 8132e959099f0c533f698d8fbc17386f9144432f --- sys/netinet/libalias/alias_db.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c index d516b6cda96c..4bb95549aaaf 100644 --- a/sys/netinet/libalias/alias_db.c +++ b/sys/netinet/libalias/alias_db.c @@ -875,6 +875,9 @@ _FindLinkIn(struct libalias *la, struct in_addr dst_addr, if (found != NULL) return (found); /* link expired */ + grp = StartPointIn(la, alias_addr, alias_port, link_type, 0); + if (grp == NULL) + return (NULL); break; } } From b49aec04f073de02b03b44503feffeb52cbbdd51 Mon Sep 17 00:00:00 2001 From: Wolfram Schneider Date: Tue, 20 Aug 2024 14:35:29 +0000 Subject: [PATCH 075/145] bsd-family-tree: fix macOS 11 release date --- share/misc/bsd-family-tree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/misc/bsd-family-tree b/share/misc/bsd-family-tree index c4a7790b773d..1d8dacf393be 100644 --- a/share/misc/bsd-family-tree +++ b/share/misc/bsd-family-tree @@ -868,7 +868,7 @@ DragonFly 5.8.3 2020-09-24 [DFB] OpenBSD 6.8 2020-10-18 [OBD] NetBSD 9.1 2020-10-18 [NBD] FreeBSD 12.2 2020-10-27 [FBD] -macOS 11 2020-11-19 [APL] +macOS 11 2020-11-12 [APL] FreeBSD 13.0 2021-04-13 [FBD] OpenBSD 6.9 2021-05-01 [OBD] DragonFly 6.0 2021-05-08 [DFB] From 66aed7e3488aa60195abcf846da5e04aa82fb1bf Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 20 Aug 2024 14:52:02 +0000 Subject: [PATCH 076/145] socket: Set lock flags properly Fixes: fb901935f257 ("socket: Split up sosend_generic()") Reported by: cy Sponsored by: Klara, Inc. 
Sponsored by: Stormshield --- sys/kern/uipc_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 13b6253bd115..42c43539b484 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1920,7 +1920,7 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, { int error; - error = SOCK_IO_SEND_LOCK(so, 0); + error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); error = sosend_generic_locked(so, addr, uio, top, control, flags, td); From aa6c490bf80fcef15cfc0d3f562fae19ef2375aa Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Tue, 20 Aug 2024 17:30:55 +0200 Subject: [PATCH 077/145] tcp: initialize the LRO hash table with correct size There will be at most lro_entries entries in the LRO hash table. So there is no need to take lro_mbufs into account, which only results in the LRO hash table being too large and therefore wasting memory. Reviewed by: rrs MFC after: 1 week Sponsored by: Netflix, Inc. 
Differential Revision: https://reviews.freebsd.org/D46378 --- sys/netinet/tcp_lro.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 921d28f82517..2603815f9e61 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -175,7 +175,7 @@ tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, { struct lro_entry *le; size_t size; - unsigned i, elements; + unsigned i; lc->lro_bad_csum = 0; lc->lro_queued = 0; @@ -190,11 +190,7 @@ tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, LIST_INIT(&lc->lro_active); /* create hash table to accelerate entry lookup */ - if (lro_entries > lro_mbufs) - elements = lro_entries; - else - elements = lro_mbufs; - lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, + lc->lro_hash = phashinit_flags(lro_entries, M_LRO, &lc->lro_hashsz, HASH_NOWAIT); if (lc->lro_hash == NULL) { memset(lc, 0, sizeof(*lc)); From 0875f3cd74b2f305e82bff4e640c89f891ca84f8 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Tue, 20 Aug 2024 11:43:11 -0400 Subject: [PATCH 078/145] Revert "x86: Enable Intel DMAR by default" A number of people have reported panics with it enabled by default, possibly due to broken ACPI tables, which we do not handle well. D46382 is a potential fix for this issue. Additionally DMAR is currently not compatible with bhyve passthrough (see comment #10 in PR280817), with a draft patch to address that in D25672. Revert to disabling DMAR by default pending the resolution of those two issues. This reverts commit 3192fc30230ae432b80cca783abc2dbea9d3f383. 
PR: 280817 Sponsored by: The FreeBSD Foundation --- sys/x86/iommu/intel_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sys/x86/iommu/intel_drv.c b/sys/x86/iommu/intel_drv.c index 9401892aa9d6..0b25620114cd 100644 --- a/sys/x86/iommu/intel_drv.c +++ b/sys/x86/iommu/intel_drv.c @@ -159,8 +159,7 @@ dmar_count_iter(ACPI_DMAR_HEADER *dmarh, void *arg) int dmar_rmrr_enable = 1; -static int dmar_enable = 1; - +static int dmar_enable = 0; static void dmar_identify(driver_t *driver, device_t parent) { From aa72c5bacb5f7ab359a3b7ae07b7c7b8705b60a8 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Tue, 20 Aug 2024 13:45:38 -0700 Subject: [PATCH 079/145] dummymbuf: fix build without INET or INET6 Note that VIMAGE would mask both compilation failures. --- sys/net/dummymbuf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sys/net/dummymbuf.c b/sys/net/dummymbuf.c index cb92889c5b77..8c46421888ed 100644 --- a/sys/net/dummymbuf.c +++ b/sys/net/dummymbuf.c @@ -117,11 +117,15 @@ SYSCTL_PROC(_net_dummymbuf, OID_AUTO, hits, * pfil(9) context */ +#ifdef INET VNET_DEFINE_STATIC(pfil_hook_t, dmb_pfil_inet_hook); #define V_dmb_pfil_inet_hook VNET(dmb_pfil_inet_hook) +#endif +#ifdef INET6 VNET_DEFINE_STATIC(pfil_hook_t, dmb_pfil_inet6_hook); #define V_dmb_pfil_inet6_hook VNET(dmb_pfil_inet6_hook) +#endif VNET_DEFINE_STATIC(pfil_hook_t, dmb_pfil_ethernet_hook); #define V_dmb_pfil_ethernet_hook VNET(dmb_pfil_ethernet_hook) @@ -321,6 +325,7 @@ dmb_pfil_mbuf_chk(int pfil_type, struct mbuf **mp, struct ifnet *ifp, return (PFIL_PASS); } +#ifdef INET static pfil_return_t dmb_pfil_inet_mbuf_chk(struct mbuf **mp, struct ifnet *ifp, int flags, void *ruleset, struct inpcb *inp) @@ -328,7 +333,9 @@ dmb_pfil_inet_mbuf_chk(struct mbuf **mp, struct ifnet *ifp, int flags, return (dmb_pfil_mbuf_chk(PFIL_TYPE_IP4, mp, ifp, flags, ruleset, inp)); } +#endif +#ifdef INET6 static pfil_return_t dmb_pfil_inet6_mbuf_chk(struct mbuf **mp, struct ifnet *ifp, int flags, void 
*ruleset, struct inpcb *inp) @@ -336,6 +343,7 @@ dmb_pfil_inet6_mbuf_chk(struct mbuf **mp, struct ifnet *ifp, int flags, return (dmb_pfil_mbuf_chk(PFIL_TYPE_IP6, mp, ifp, flags, ruleset, inp)); } +#endif static pfil_return_t dmb_pfil_ethernet_mbuf_chk(struct mbuf **mp, struct ifnet *ifp, int flags, From 6bc966987e7a6e5bcf9568b5c8b17037787c33a2 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Tue, 20 Aug 2024 13:46:09 -0700 Subject: [PATCH 080/145] dummymbuf: add to LINT --- sys/conf/NOTES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/conf/NOTES b/sys/conf/NOTES index a531620ede55..47d6bcb4e5da 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -977,6 +977,9 @@ device lagg # WireGuard interface. device wg +# dummymbuf – mbuf alteration pfil hooks +device dummymbuf + # # Internet family options: # From 417b35a97b7669eb0bf417b43e97cccbedbce6f9 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 20 Aug 2024 21:31:57 +0000 Subject: [PATCH 081/145] netinet: Add a sysctl to allow disabling connections to INADDR_ANY See the discussion in Bugzilla PR 280705 for context. 
PR: 280705 MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D46259 --- sys/netinet/in_pcb.c | 8 +++++++- sys/netinet6/in6_pcb.c | 12 +++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 1a341d421f31..3fc90f1e12c2 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -234,6 +234,12 @@ in_pcbhashseed_init(void) VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in_pcbhashseed_init, 0); +VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1; +#define V_connect_inaddr_wild VNET(connect_inaddr_wild) +SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, + "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); + static void in_pcbremhash(struct inpcb *); /* @@ -1309,7 +1315,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin, inp->inp_flowtype = hash_type; } #endif - if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { + if (V_connect_inaddr_wild && !CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. 
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index e6ec0f24c898..098b4e50483c 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,7 @@ #include #include #include +#include #include #include @@ -112,6 +114,14 @@ #include #include +SYSCTL_DECL(_net_inet6); +SYSCTL_DECL(_net_inet6_ip6); +VNET_DEFINE_STATIC(int, connect_in6addr_wild) = 1; +#define V_connect_in6addr_wild VNET(connect_in6addr_wild) +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, connect_in6addr_wild, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_in6addr_wild), 0, + "Allow connecting to the unspecified address for connect(2)"); + int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { @@ -351,7 +361,7 @@ in6_pcbladdr(struct inpcb *inp, struct sockaddr_in6 *sin6, if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); - if (!CK_STAILQ_EMPTY(&V_in6_ifaddrhead)) { + if (V_connect_in6addr_wild && !CK_STAILQ_EMPTY(&V_in6_ifaddrhead)) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. From 64443828bbe7c571db8d8731758ec8c4b8364c86 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Wed, 21 Aug 2024 00:07:37 +0200 Subject: [PATCH 082/145] tcp: fix list iteration in tcp_lro_flush_active() Use LIST_FOREACH_SAFE(), since the list element is removed from the list in the loop body, zeroed out, and inserted into the free list. Reviewed by: rrs MFC after: 1 week Sponsored by: Netflix, Inc. 
Differential Revision: https://reviews.freebsd.org/D46383 --- sys/netinet/tcp_lro.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 2603815f9e61..906e01257a04 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -595,7 +595,7 @@ tcp_lro_rx_done(struct lro_ctrl *lc) static void tcp_lro_flush_active(struct lro_ctrl *lc) { - struct lro_entry *le; + struct lro_entry *le, *le_tmp; /* * Walk through the list of le entries, and @@ -607,7 +607,7 @@ tcp_lro_flush_active(struct lro_ctrl *lc) * is being freed. This is ok it will just get * reallocated again like it was new. */ - LIST_FOREACH(le, &lc->lro_active, next) { + LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (le->m_head != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); From aa0bc761d245d2ea1e4b7a0343715cc76859d5da Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Fri, 24 May 2024 12:15:17 -0400 Subject: [PATCH 083/145] bsd.symver.mk: pass $CFLAGS to $CPP invocation This allows us to support symbols optionally available based on configuration, not just on compiler built-in #defines. Reviewed by: brooks, jrtc27 Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D45346 --- share/mk/bsd.symver.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/mk/bsd.symver.mk b/share/mk/bsd.symver.mk index a1c6e62fba76..d0b63206d8fe 100644 --- a/share/mk/bsd.symver.mk +++ b/share/mk/bsd.symver.mk @@ -44,7 +44,7 @@ _vgen= ${path}/${VERSION_GEN} # Run the symbol maps through the C preprocessor before passing # them to the symbol version generator. 
${VERSION_MAP}: ${VERSION_DEF} ${_vgen} ${SYMBOL_MAPS} - cat ${SYMBOL_MAPS} | ${CPP} - - \ + cat ${SYMBOL_MAPS} | ${CPP} ${CFLAGS} - - \ | awk -v vfile=${VERSION_DEF} -f ${_vgen} > ${.TARGET} .endif # !empty(VERSION_DEF) && !empty(SYMBOL_MAPS) .endif # !target(____) From 2477e88b8d4328535357bc62409f673a551be179 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Tue, 20 Aug 2024 18:48:19 -0700 Subject: [PATCH 084/145] nfs: Add support for the NFSv4.2 mode_umask attribute RFC8275 defines a new attribute as an extension to NFSv4.2 called MODE_UMASK. This patch adds support for this attribute to the NFSv4.2 client and server. Since FreeBSD applies the umask above the VFS/VOP layer, this attribute does not actually have any effect on the handling of ACL inheritance, which is what it is designed for. However, future changes to NFSv4.2 require support of it, so this patch does that, resulting in behaviour identical to the mode attribute already supported. MFC after: 2 months --- sys/fs/nfs/nfs_commonsubs.c | 46 +++++++++++++++++++++++---------- sys/fs/nfs/nfscl.h | 9 ++++--- sys/fs/nfs/nfsproto.h | 12 ++++++--- sys/fs/nfsclient/nfs_clrpcops.c | 18 +++++++------ sys/fs/nfsserver/nfs_nfsdport.c | 17 ++++++++++++ 5 files changed, 72 insertions(+), 30 deletions(-) diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c index 3c9af40253ad..f0469958a43f 100644 --- a/sys/fs/nfs/nfs_commonsubs.c +++ b/sys/fs/nfs/nfs_commonsubs.c @@ -610,8 +610,18 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap, break; case ND_NFSV4: NFSZERO_ATTRBIT(&attrbits); - if (vap->va_mode != (mode_t)VNOVAL) - NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MODE); + np = NULL; + if (strcmp(vp->v_mount->mnt_vfc->vfc_name, "nfs") == 0) + np = VTONFS(vp); + if (vap->va_mode != (mode_t)VNOVAL) { + if ((flags & NFSSATTR_NEWFILE) != 0 && np != NULL && + NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_MODEUMASK)) + NFSSETBIT_ATTRBIT(&attrbits, + NFSATTRBIT_MODEUMASK); + else + 
NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MODE); + } if ((flags & NFSSATTR_FULL) && vap->va_uid != (uid_t)VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_OWNER); if ((flags & NFSSATTR_FULL) && vap->va_gid != (gid_t)VNOVAL) @@ -622,18 +632,14 @@ nfscl_fillsattr(struct nfsrv_descript *nd, struct vattr *vap, NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESSSET); if (vap->va_mtime.tv_sec != VNOVAL) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFYSET); - if (vap->va_birthtime.tv_sec != VNOVAL && - strcmp(vp->v_mount->mnt_vfc->vfc_name, "nfs") == 0) { - /* - * We can only test for support of TimeCreate if - * the "vp" argument is for an NFS vnode. - */ - np = VTONFS(vp); - if (NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, - NFSATTRBIT_TIMECREATE)) - NFSSETBIT_ATTRBIT(&attrbits, - NFSATTRBIT_TIMECREATE); - } + /* + * We can only test for support of TimeCreate if + * the "vp" argument is for an NFS vnode. + */ + if (vap->va_birthtime.tv_sec != VNOVAL && np != NULL && + NFSISSET_ATTRBIT(&np->n_vattr.na_suppattr, + NFSATTRBIT_TIMECREATE)) + NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMECREATE); (void) nfsv4_fillattr(nd, vp->v_mount, vp, NULL, vap, NULL, 0, &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); break; @@ -3109,6 +3115,18 @@ nfsv4_fillattr(struct nfsrv_descript *nd, struct mount *mp, vnode_t vp, *tl = newnfs_false; retnum += NFSX_UNSIGNED; break; + case NFSATTRBIT_MODEUMASK: + NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); + /* + * Since FreeBSD applies the umask above the VFS/VOP, + * there is no umask to handle here. If FreeBSD + * moves handling of umask to below the VFS/VOP, + * this could change. + */ + *tl++ = vtonfsv34_mode(vap->va_mode); + *tl = 0; + retnum += 2 * NFSX_UNSIGNED; + break; default: printf("EEK! 
Bad V4 attribute bitpos=%d\n", bitpos); } diff --git a/sys/fs/nfs/nfscl.h b/sys/fs/nfs/nfscl.h index a52b9e433145..3b1445e1923c 100644 --- a/sys/fs/nfs/nfscl.h +++ b/sys/fs/nfs/nfscl.h @@ -68,10 +68,11 @@ struct nfsv4node { * These flag bits are used for the argument to nfscl_fillsattr() to * indicate special handling of the attributes. */ -#define NFSSATTR_FULL 0x1 -#define NFSSATTR_SIZE0 0x2 -#define NFSSATTR_SIZENEG1 0x4 -#define NFSSATTR_SIZERDEV 0x8 +#define NFSSATTR_FULL 0x01 +#define NFSSATTR_SIZE0 0x02 +#define NFSSATTR_SIZENEG1 0x04 +#define NFSSATTR_SIZERDEV 0x08 +#define NFSSATTR_NEWFILE 0x10 /* Use this macro for debug printfs. */ #define NFSCL_DEBUG(level, ...) do { \ diff --git a/sys/fs/nfs/nfsproto.h b/sys/fs/nfs/nfsproto.h index 0268940fd8a6..ce7acf102d41 100644 --- a/sys/fs/nfs/nfsproto.h +++ b/sys/fs/nfs/nfsproto.h @@ -1183,7 +1183,8 @@ struct nfsv3_sattr { */ #define NFSATTRBIT_SUPPSETONLY1 (NFSATTRBM_TIMEACCESSSET | \ NFSATTRBM_TIMEMODIFYSET) -#define NFSATTRBIT_SUPPSETONLY2 (NFSATTRBM_MODESETMASKED) +#define NFSATTRBIT_SUPPSETONLY2 (NFSATTRBM_MODESETMASKED | \ + NFSATTRBM_MODEUMASK) /* * NFSATTRBIT_SETABLE - SETABLE0 - bits 0<->31 @@ -1197,11 +1198,12 @@ struct nfsv3_sattr { (NFSATTRBM_MODE | \ NFSATTRBM_OWNER | \ NFSATTRBM_OWNERGROUP | \ - NFSATTRBM_TIMECREATE | \ + NFSATTRBM_TIMECREATE | \ NFSATTRBM_TIMEACCESSSET | \ NFSATTRBM_TIMEMODIFYSET) #define NFSATTRBIT_SETABLE2 \ - (NFSATTRBM_MODESETMASKED) + (NFSATTRBM_MODESETMASKED | \ + NFSATTRBM_MODEUMASK) /* * NFSATTRBIT_NFSV41 - Attributes only supported by NFSv4.1. @@ -1218,7 +1220,9 @@ struct nfsv3_sattr { /* * NFSATTRBIT_NFSV42 - Attributes only supported by NFSv4.2. */ -#define NFSATTRBIT_NFSV42_2 NFSATTRBM_XATTRSUPPORT +#define NFSATTRBIT_NFSV42_2 \ + (NFSATTRBM_XATTRSUPPORT | \ + NFSATTRBM_MODEUMASK) /* * Set of attributes that the getattr vnode op needs. 
diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index 8947b608b743..7540893ce63c 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -2423,7 +2423,7 @@ nfsrpc_mknod(vnode_t dvp, char *name, int namelen, struct vattr *vap, *tl = vtonfsv34_type(vtyp); } if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); if ((nd->nd_flag & ND_NFSV3) && (vtyp == VCHR || vtyp == VBLK)) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); @@ -2645,14 +2645,16 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, if (NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. */ *tl = txdr_unsigned(NFSCREATE_GUARDED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, + 0); } else { /* Otherwise, use EXCLUSIVE4_1. */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, + 0); } } else { /* NFSv4.0 */ @@ -2663,7 +2665,7 @@ nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); @@ -3238,7 +3240,7 @@ nfsrpc_mkdir(vnode_t dvp, char *name, int namelen, struct vattr *vap, *tl = txdr_unsigned(NFDIR); } (void) nfsm_strtom(nd, name, namelen); - nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1 | NFSSATTR_NEWFILE, 0); if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); @@ -8456,18 +8458,18 @@ nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, if 
(NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. */ *tl = txdr_unsigned(NFSCREATE_GUARDED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } else { /* Otherwise, use EXCLUSIVE4_1. */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); - nfscl_fillsattr(nd, vap, dvp, 0, 0); + nfscl_fillsattr(nd, vap, dvp, NFSSATTR_NEWFILE, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index fa99a76f492e..12592d4c5c59 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -3212,6 +3212,23 @@ nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap, nd->nd_repstat = moderet; attrsum += 2 * NFSX_UNSIGNED; break; + case NFSATTRBIT_MODEUMASK: + NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); + mode = fxdr_unsigned(u_short, *tl++); + mask = fxdr_unsigned(u_short, *tl); + /* + * If moderet != 0, mode has already been done. + * If vp != NULL, this is not a file object creation. 
+ */ + if ((nd->nd_flag & ND_NFSV42) == 0) + nd->nd_repstat = NFSERR_ATTRNOTSUPP; + else if ((mask & ~0777) != 0 || vp != NULL || + moderet != 0) + nd->nd_repstat = NFSERR_INVAL; + else + nvap->na_mode = (mode & ~mask); + attrsum += 2 * NFSX_UNSIGNED; + break; default: nd->nd_repstat = NFSERR_ATTRNOTSUPP; /* From 213eb102aeec50b8c236aac1d8f0e0a3f9a99449 Mon Sep 17 00:00:00 2001 From: Steve Kargl Date: Wed, 21 Aug 2024 14:59:07 +0800 Subject: [PATCH 085/145] msun: Fix typo in comment PR: 280965 MFC after: 3 days --- lib/msun/src/math_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/msun/src/math_private.h b/lib/msun/src/math_private.h index f3f7985ab7b6..1595f902846c 100644 --- a/lib/msun/src/math_private.h +++ b/lib/msun/src/math_private.h @@ -405,7 +405,7 @@ do { \ * any extra precision into the type of 'a' -- 'a' should have type float_t, * double_t or long double. b's type should be no larger than 'a's type. * Callers should use these types with scopes as large as possible, to - * reduce their own extra-precision and efficiciency problems. In + * reduce their own extra-precision and efficiency problems. In * particular, they shouldn't convert back and forth just to call here. */ #ifdef DEBUG From db6e3260a8b9257aa4f1991867e46973e4f0dce8 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:46:05 +0100 Subject: [PATCH 086/145] arm64: Remove the E2H check from has_hyp This was added to not use the physical timer when E2H was set. As we now use the correct timer in this case we can remove this extra check. 
Tested by: kevans Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46086 --- sys/arm64/arm64/genassym.c | 1 - sys/arm64/arm64/locore.S | 1 - sys/arm64/arm64/machdep.c | 9 +-------- sys/arm64/include/machdep.h | 1 - 4 files changed, 1 insertion(+), 11 deletions(-) diff --git a/sys/arm64/arm64/genassym.c b/sys/arm64/arm64/genassym.c index c4f52ae61a77..a4db825e976c 100644 --- a/sys/arm64/arm64/genassym.c +++ b/sys/arm64/arm64/genassym.c @@ -42,7 +42,6 @@ ASSYM(BP_MODULEP, offsetof(struct arm64_bootparams, modulep)); ASSYM(BP_KERN_STACK, offsetof(struct arm64_bootparams, kern_stack)); ASSYM(BP_KERN_TTBR0, offsetof(struct arm64_bootparams, kern_ttbr0)); ASSYM(BP_BOOT_EL, offsetof(struct arm64_bootparams, boot_el)); -ASSYM(BP_HCR_EL2, offsetof(struct arm64_bootparams, hcr_el2)); ASSYM(PCPU_SIZE, sizeof(struct pcpu)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index 4252ea3f59f8..ab1fea0c4716 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -134,7 +134,6 @@ virtdone: str x25, [x0, #BP_KERN_STACK] str x27, [x0, #BP_KERN_TTBR0] str x23, [x0, #BP_BOOT_EL] - str x4, [x0, #BP_HCR_EL2] #ifdef KASAN /* Save bootparams */ diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c index 090ea2c10853..c73c6d205c1a 100644 --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -131,7 +131,6 @@ static struct trapframe proc0_tf; int early_boot = 1; int cold = 1; static int boot_el; -static uint64_t hcr_el2; struct kva_md_info kmi; @@ -207,12 +206,7 @@ pan_enable(void) bool has_hyp(void) { - - /* - * XXX The E2H check is wrong, but it's close enough for now. Needs to - * be re-evaluated once we're running regularly in EL2. 
- */ - return (boot_el == CURRENTEL_EL_EL2 && (hcr_el2 & HCR_E2H) == 0); + return (boot_el == CURRENTEL_EL_EL2); } bool @@ -905,7 +899,6 @@ initarm(struct arm64_bootparams *abp) TSRAW(&thread0, TS_ENTER, __func__, NULL); boot_el = abp->boot_el; - hcr_el2 = abp->hcr_el2; /* Parse loader or FDT boot parametes. Determine last used address. */ lastaddr = parse_boot_param(abp); diff --git a/sys/arm64/include/machdep.h b/sys/arm64/include/machdep.h index 2f2960ae39f2..4fa80219da42 100644 --- a/sys/arm64/include/machdep.h +++ b/sys/arm64/include/machdep.h @@ -33,7 +33,6 @@ struct arm64_bootparams { vm_offset_t modulep; vm_offset_t kern_stack; vm_paddr_t kern_ttbr0; - uint64_t hcr_el2; int boot_el; /* EL the kernel booted from */ int pad; }; From 0054693392f094c035fd396db339b8ceb16f17dd Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 19 Aug 2024 13:46:10 +0100 Subject: [PATCH 087/145] arm64: Boot into VHE mode when able When FEAT_VHE is present the HCR_EL2 E2H field will be implemented. Try setting this and checking if it's still set to decide if we can boot into VHE mode or not. When it is implemented the kernel will boot into EL2 rather than EL1 it currently boots to. The parts of the kernel that need to know if they are in EL1 or EL2 have been updated other than the CoreSight driver as there doesn't appear to be any way to currently use it and it will be updated soon with the new HWT framework. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46087 --- sys/arm64/arm64/locore.S | 65 ++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index ab1fea0c4716..b71e02538716 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -296,8 +296,12 @@ LEND(mpentry_common) #endif /* - * If we are started in EL2, configure the required hypervisor - * registers and drop to EL1. 
+ * Enter the exception level the kernel will use: + * + * - If in EL1 continue in EL1 + * - If the CPU supports FEAT_VHE then set HCR_E2H and HCR_TGE and continue + * in EL2 + * - Configure EL2 to support running the kernel at EL1 and exit to that */ LENTRY(enter_kernel_el) #define INIT_SCTLR_EL1 (SCTLR_LSMAOE | SCTLR_nTLSMD | SCTLR_EIS | \ @@ -335,13 +339,14 @@ LENTRY(enter_kernel_el) isb /* Configure the Hypervisor */ - ldr x2, =(HCR_RW | HCR_APK | HCR_API) + ldr x2, =(HCR_RW | HCR_APK | HCR_API | HCR_E2H) msr hcr_el2, x2 /* Stash value of HCR_EL2 for later */ isb mrs x4, hcr_el2 + /* Load the Virtualization Process ID Register */ mrs x2, midr_el1 msr vpidr_el2, x2 @@ -354,41 +359,51 @@ LENTRY(enter_kernel_el) ldr x2, =INIT_SCTLR_EL1 msr sctlr_el1, x2 + /* Check if the E2H flag is set */ + tst x4, #HCR_E2H + b.eq .Lno_vhe + /* - * On some hardware, e.g., Apple M1, we can't clear E2H, so make sure we - * don't trap to EL2 for SIMD register usage to have at least a - * minimally usable system. + * The kernel will be running in EL2, route exceptions here rather + * than EL1. 
*/ - tst x4, #HCR_E2H - mov x3, #CPTR_RES1 /* HCR_E2H == 0 */ - mov x5, #CPTR_FPEN /* HCR_E2H == 1 */ - csel x2, x3, x5, eq + orr x4, x4, #(HCR_TGE) + msr hcr_el2, x4 + isb + + msr SCTLR_EL12_REG, x2 + ldr x2, =(CPTR_FPEN) + ldr x3, =(CNTHCTL_E2H_EL1PCTEN | CNTHCTL_E2H_EL1PTEN) + ldr x5, =(PSR_DAIF | PSR_M_EL2h) + b .Ldone_vhe + +.Lno_vhe: + /* Hypervisor trap functions */ + adrp x2, hyp_stub_vectors + add x2, x2, :lo12:hyp_stub_vectors + msr vbar_el2, x2 + + ldr x2, =(CPTR_RES1) + ldr x3, =(CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) + ldr x5, =(PSR_DAIF | PSR_M_EL1h) + +.Ldone_vhe: + msr cptr_el2, x2 + /* Enable access to the physical timers at EL1 */ + msr cnthctl_el2, x3 + /* Set the return PSTATE */ + msr spsr_el2, x5 /* Don't trap to EL2 for CP15 traps */ msr hstr_el2, xzr - /* Enable access to the physical timers at EL1 */ - tst x4, #HCR_E2H - ldr x3, =(CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) - ldr x5, =(CNTHCTL_E2H_EL1PCTEN | CNTHCTL_E2H_EL1PTEN) - csel x2, x3, x5, eq - msr cnthctl_el2, x2 - /* Set the counter offset to a known value */ msr cntvoff_el2, xzr - /* Hypervisor trap functions */ - adrp x2, hyp_stub_vectors - add x2, x2, :lo12:hyp_stub_vectors - msr vbar_el2, x2 - /* Zero vttbr_el2 so a hypervisor can tell the host and guest apart */ msr vttbr_el2, xzr - mov x2, #(PSR_DAIF | PSR_M_EL1h) - msr spsr_el2, x2 - /* Configure GICv3 CPU interface */ mrs x2, id_aa64pfr0_el1 /* Extract GIC bits from the register */ From 7412517f2947342d599e42dd563fd6b3a7656e29 Mon Sep 17 00:00:00 2001 From: Zhenlei Huang Date: Wed, 21 Aug 2024 18:01:30 +0800 Subject: [PATCH 088/145] init_main: Sprinkle const qualifiers where appropriate No functional change intended. 
MFC after: 1 week --- sys/kern/init_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 7386a0729835..9d2663015027 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -347,13 +347,13 @@ mi_startup(void) } static void -print_caddr_t(void *data) +print_caddr_t(const void *data) { - printf("%s", (char *)data); + printf("%s", (const char *)data); } static void -print_version(void *data __unused) +print_version(const void *data __unused) { int len; From 78d69d0a3c7f17944a7ecb84509fca6d0aa8545e Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 21 Aug 2024 09:06:57 +0000 Subject: [PATCH 089/145] arm64: Fix the kernel with options VMM * We can build the non-VHE code with branch protection, it is already built as such in the module. * Use the correct file name for the non-VHE exception .o file. --- sys/conf/files.arm64 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index b105ce873d24..1c863ff98998 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -125,13 +125,13 @@ arm64/vmm/vmm_reset.c optional vmm arm64/vmm/vmm_handlers.c optional vmm arm64/vmm/vmm_call.S optional vmm arm64/vmm/vmm_nvhe_exception.S optional vmm \ - compile-with "${NOSAN_C:N-mbranch-protection*} -fpie" \ + compile-with "${NOSAN_C} -fpie" \ no-obj arm64/vmm/vmm_nvhe.c optional vmm \ - compile-with "${NOSAN_C:N-mbranch-protection*} -fpie" \ + compile-with "${NOSAN_C} -fpie" \ no-obj vmm_hyp_blob.elf.full optional vmm \ - dependency "vmm_nvhe.o vmm_hyp_exception.o" \ + dependency "vmm_nvhe.o vmm_nvhe_exception.o" \ compile-with "${SYSTEM_LD_BASECMD} -o ${.TARGET} ${.ALLSRC} --defsym=_start='0x0' --defsym=text_start='0x0'" \ no-obj no-implicit-rule vmm_hyp_blob.elf optional vmm \ From f1bc3750cf9a6623b0c0861984ef2a8ac966a4e3 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Tue, 20 Aug 2024 18:14:28 +0100 Subject: [PATCH 090/145] 
arm64: Use store-pair to zero the kernel bss While this won't be noticed by most users the time to zero the bss while using instruction tracing in the Arm FVP models (simulators) is noticeable. Reduce this time by using a store-pair instruction to double the size of memory we zero on each iteration of the loop. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D42733 --- sys/arm64/arm64/locore.S | 2 +- sys/conf/ldscript.arm64 | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index b71e02538716..ae1a005fd68f 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -111,7 +111,7 @@ virtdone: ldr x15, .Lbss ldr x14, .Lend 1: - str xzr, [x15], #8 + stp xzr, xzr, [x15], #16 cmp x15, x14 b.lo 1b diff --git a/sys/conf/ldscript.arm64 b/sys/conf/ldscript.arm64 index ea52a3128527..32af035105d0 100644 --- a/sys/conf/ldscript.arm64 +++ b/sys/conf/ldscript.arm64 @@ -121,6 +121,7 @@ SECTIONS .sdata : { *(.sdata) } _edata = .; PROVIDE (edata = .); + . = ALIGN(16); __bss_start = .; .sbss : { *(.sbss) *(.scommon) } .bss : @@ -128,7 +129,7 @@ SECTIONS *(.dynbss) *(.bss) *(COMMON) - . = ALIGN(8); + . = ALIGN(16); __bss_end = .; /* A section for the initial page table, it doesn't need to be in the kernel file, however unlike normal .bss entries should not be zeroed From 024248c933c5741a21c17eda63092f330dd98337 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Tue, 2 Jan 2024 16:32:08 +0000 Subject: [PATCH 091/145] libc/aarch64: Remove an unneeded weak symbol The index symbol doesn't belong in memcpy.S as it is already in strchr.S where it belongs. 
Sponsored by: Arm Ltd --- lib/libc/aarch64/string/memcpy.S | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/libc/aarch64/string/memcpy.S b/lib/libc/aarch64/string/memcpy.S index ac4fbe8d6175..f403dd2e42a8 100644 --- a/lib/libc/aarch64/string/memcpy.S +++ b/lib/libc/aarch64/string/memcpy.S @@ -1,6 +1,3 @@ #define __memcpy_aarch64 memcpy #define __memmove_aarch64 memmove #include "aarch64/memcpy.S" - -.weak index -.equ index, strchr From 828445cc5e18b678ff032325a8dadc59b9bfbea6 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 21 Aug 2024 01:20:49 +0300 Subject: [PATCH 092/145] if_vlan: set if_cap{abilities2,enable2} after IFCAP_IPSEC_OFFLOAD is recalculated This makes the vlan IPSEC offload really functional. Noted by: Ariel Ehrenberg Sponsored by: NVidia networking --- sys/net/if_vlan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 45489138fdef..a30db9173383 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -2210,8 +2210,6 @@ vlan_capabilities(struct ifvlan *ifv) ifp->if_capabilities = cap; ifp->if_capenable = ena; - ifp->if_capabilities2 = cap2; - ifp->if_capenable2 = ena2; ifp->if_hwassist = hwa; #ifdef IPSEC_OFFLOAD @@ -2219,6 +2217,9 @@ vlan_capabilities(struct ifvlan *ifv) ena2 |= mena2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD); ifp->if_ipsec_accel_m = &vlan_if_ipsec_accel_methods; #endif + + ifp->if_capabilities2 = cap2; + ifp->if_capenable2 = ena2; } static void From e7f9171b6738809ded7250bc5c78368421255b1b Mon Sep 17 00:00:00 2001 From: Igor Ostapenko Date: Wed, 21 Aug 2024 12:01:34 +0200 Subject: [PATCH 093/145] pf: Handle m_len < sizeof(struct ether_header) case Reviewed by: kp Differential Revision: https://reviews.freebsd.org/D46391 --- sys/netpfil/pf/pf.c | 7 ++++ tests/sys/netpfil/pf/mbuf.sh | 73 ++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index ad2dc2e707ed..cb69d06b1fe6 100644 --- 
a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -4365,6 +4365,13 @@ pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0) r = TAILQ_FIRST(rules); rm = NULL; + if (__predict_false(m->m_len < sizeof(struct ether_header)) && + (m = *m0 = m_pullup(*m0, sizeof(struct ether_header))) == NULL) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test_eth_rule: m_len < sizeof(struct ether_header)" + ", pullup failed\n")); + return (PF_DROP); + } e = mtod(m, struct ether_header *); proto = ntohs(e->ether_type); diff --git a/tests/sys/netpfil/pf/mbuf.sh b/tests/sys/netpfil/pf/mbuf.sh index 2dffa48ed2f5..a4664718093a 100644 --- a/tests/sys/netpfil/pf/mbuf.sh +++ b/tests/sys/netpfil/pf/mbuf.sh @@ -151,8 +151,81 @@ inet6_in_mbuf_len_cleanup() pft_cleanup } +atf_test_case "ethernet_in_mbuf_len" "cleanup" +ethernet_in_mbuf_len_head() +{ + atf_set descr 'Test that pf can handle inbound with the first mbuf with m_len < sizeof(struct ether_header)' + atf_set require.user root +} +ethernet_in_mbuf_len_body() +{ + pft_init + dummymbuf_init + + epair=$(vnet_mkepair) + epair_a_mac=$(ifconfig ${epair}a ether | awk '/ether/ { print $2; }') + ifconfig ${epair}a 192.0.2.1/24 up + + # Set up a simple jail with one interface + vnet_mkjail alcatraz ${epair}b + jexec alcatraz ifconfig ${epair}b 192.0.2.2/24 up + epair_b_mac=$(jexec alcatraz ifconfig ${epair}b ether | awk '/ether/ { print $2; }') + + # Sanity check + atf_check -s exit:0 -o ignore ping -c1 192.0.2.2 + + # Should be denied + jexec alcatraz pfctl -e + pft_set_rules alcatraz \ + "ether block" \ + "pass" + atf_check -s not-exit:0 -o ignore ping -c1 -t1 192.0.2.2 + + # Should be allowed by from/to addresses + echo $epair_a_mac + echo $epair_b_mac + pft_set_rules alcatraz \ + "ether block" \ + "ether pass in from ${epair_a_mac} to ${epair_b_mac}" \ + "ether pass out from ${epair_b_mac} to ${epair_a_mac}" \ + "pass" + atf_check -s exit:0 -o ignore ping -c1 192.0.2.2 + + # Should still work for m_len=0 + jexec alcatraz pfilctl link -i 
dummymbuf:ethernet ethernet + jexec alcatraz sysctl net.dummymbuf.rules="ethernet in ${epair}b pull-head 0;" + atf_check_equal "0" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" + atf_check -s exit:0 -o ignore ping -c1 192.0.2.2 + atf_check_equal "1" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" + + # m_len=1 + jexec alcatraz sysctl net.dummymbuf.rules="ethernet in ${epair}b pull-head 1;" + jexec alcatraz sysctl net.dummymbuf.hits=0 + atf_check -s exit:0 -o ignore ping -c1 192.0.2.2 + atf_check_equal "1" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" + + # m_len=11 + # for the simplest L2 Ethernet frame it should impact src field + jexec alcatraz sysctl net.dummymbuf.rules="ethernet in ${epair}b pull-head 11;" + jexec alcatraz sysctl net.dummymbuf.hits=0 + atf_check -s exit:0 -o ignore ping -c1 192.0.2.2 + atf_check_equal "1" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" + + # m_len=13 + # provided L2 Ethernet simplest header is 14 bytes long, it should impact ethertype field + jexec alcatraz sysctl net.dummymbuf.rules="ethernet in ${epair}b pull-head 13;" + jexec alcatraz sysctl net.dummymbuf.hits=0 + atf_check -s exit:0 -o ignore ping -c1 192.0.2.2 + atf_check_equal "1" "$(jexec alcatraz sysctl -n net.dummymbuf.hits)" +} +ethernet_in_mbuf_len_cleanup() +{ + pft_cleanup +} + atf_init_test_cases() { atf_add_test_case "inet_in_mbuf_len" atf_add_test_case "inet6_in_mbuf_len" + atf_add_test_case "ethernet_in_mbuf_len" } From 1af7d5f389536a2f391153513d95d92ffdf360e4 Mon Sep 17 00:00:00 2001 From: Pietro Cerutti Date: Wed, 21 Aug 2024 12:35:27 +0000 Subject: [PATCH 094/145] libfetch: don't include fragments in HTTP requests Summary: Fragments are reserved for client-side processing, see https://www.rfc-editor.org/rfc/rfc9110.html#section-7.1 Also, some servers don't like to receive HTTP requests with fragments. 
``` $ fetch 'https://dropbox.com/a/b' fetch: https://dropbox.com/a/b: Not Found $ fetch 'https://dropbox.com/a/b#' fetch: https://dropbox.com/a/b#: Bad Request ``` This is a real-world scenario, where some download link from dropbox (eventually) redirects to an URL with a fragment: ``` $ fetch -v 'https://www.dropbox.com/sh//?dl=1' 2>&1 | grep requesting requesting https://www.dropbox.com/sh//?dl=1 requesting https://www.dropbox.com/scl/fo//?rlkey=&dl=1 requesting https://.dl.dropboxusercontent.com/zip_download_get/# ``` See how the last redirect ends with a `#`. Currently, libfetch includes the ending fragment and makes it impossible to download the file. Differential Revision: https://reviews.freebsd.org/D46318 MFC after: 2 weeks --- lib/libfetch/fetch.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/libfetch/fetch.c b/lib/libfetch/fetch.c index 12cbd0fb746f..97fc04bb09a6 100644 --- a/lib/libfetch/fetch.c +++ b/lib/libfetch/fetch.c @@ -447,7 +447,10 @@ fetchParseURL(const char *URL) goto ouch; } u->doc = doc; - while (*p != '\0') { + /* fragments are reserved for client-side processing, see + * https://www.rfc-editor.org/rfc/rfc9110.html#section-7.1 + */ + while (*p != '\0' && *p != '#') { if (!isspace((unsigned char)*p)) { *doc++ = *p++; } else { From db87c98168b1605f067d283fa36a710369c3849d Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Tue, 20 Aug 2024 14:12:47 -0400 Subject: [PATCH 095/145] ctl: avoid heap info leak in ctl_request_sense Previously 3 bytes of data from the heap could be leaked to ctl consumers. 
Reported by: Synacktiv Reviewed by: asomers, mav Sponsored by: The Alpha-Omega Project Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D46091 --- sys/cam/ctl/ctl.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c index fac65e155890..845cffe77a5d 100644 --- a/sys/cam/ctl/ctl.c +++ b/sys/cam/ctl/ctl.c @@ -9304,14 +9304,8 @@ ctl_request_sense(struct ctl_scsiio *ctsio) sense_ptr = (struct scsi_sense_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; - - /* - * struct scsi_sense_data, which is currently set to 256 bytes, is - * larger than the largest allowed value for the length field in the - * REQUEST SENSE CDB, which is 252 bytes as of SPC-4. - */ - ctsio->kern_data_len = cdb->length; - ctsio->kern_total_len = cdb->length; + ctsio->kern_data_len = ctsio->kern_total_len = + MIN(cdb->length, sizeof(*sense_ptr)); /* * If we don't have a LUN, we don't have any pending sense. 
From a3f10d0882e1aebef27698f1e0f94ffadade5935 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 12 Aug 2024 06:48:32 +0300 Subject: [PATCH 096/145] rangelocks: add rangelock_free_free() helper to free free list Tested by: markj, pho Sponsored by: The FreeBSD Foundation --- sys/kern/kern_rangelock.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index 20b65778c06d..c2f1f2d762bb 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -288,6 +288,8 @@ struct rl_q_entry { static uma_zone_t rl_entry_zone; static smr_t rl_smr; +static void rangelock_free_free(struct rl_q_entry *free); + static void rangelock_sys_init(void) { @@ -392,6 +394,26 @@ rl_e_is_rlock(const struct rl_q_entry *e) return ((e->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ); } +static void +rangelock_free_free(struct rl_q_entry *free) +{ + struct rl_q_entry *x, *xp; + struct thread *td; + + td = curthread; + for (x = free; x != NULL; x = xp) { + MPASS(!rl_e_is_marked(x)); + xp = x->rl_q_free; + MPASS(!rl_e_is_marked(xp)); + if (td->td_rlqe == NULL) { + smr_synchronize(rl_smr); + td->td_rlqe = x; + } else { + uma_zfree_smr(rl_entry_zone, x); + } + } +} + static void rangelock_unlock_int(struct rangelock *lock, struct rl_q_entry *e) { @@ -623,14 +645,12 @@ static struct rl_q_entry * rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start, vm_ooffset_t end, int locktype) { - struct rl_q_entry *e, *free, *x, *xp; - struct thread *td; + struct rl_q_entry *e, *free; void *cookie; enum RL_INSERT_RES res; if (rangelock_cheat_lock(lock, locktype, trylock, &cookie)) return (cookie); - td = curthread; for (res = RL_LOCK_RETRY; res == RL_LOCK_RETRY;) { free = NULL; e = rlqentry_alloc(start, end, locktype); @@ -643,17 +663,7 @@ rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start, free = e; e = NULL; } - for (x = free; x != 
NULL; x = xp) { - MPASS(!rl_e_is_marked(x)); - xp = x->rl_q_free; - MPASS(!rl_e_is_marked(xp)); - if (td->td_rlqe == NULL) { - smr_synchronize(rl_smr); - td->td_rlqe = x; - } else { - uma_zfree_smr(rl_entry_zone, x); - } - } + rangelock_free_free(free); } return (e); } From 8a5b2db3d81db16e9e6aaea82cc071bdc766e360 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sat, 10 Aug 2024 01:55:36 +0300 Subject: [PATCH 097/145] ranglelock_destroy(): do not remove lock entries from under live lock acquirer Tested by: markj, pho Sponsored by: The FreeBSD Foundation --- sys/kern/kern_rangelock.c | 56 ++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index c2f1f2d762bb..c01ed05e4a6a 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -289,6 +289,7 @@ static uma_zone_t rl_entry_zone; static smr_t rl_smr; static void rangelock_free_free(struct rl_q_entry *free); +static void rangelock_noncheating_destroy(struct rangelock *lock); static void rangelock_sys_init(void) @@ -340,16 +341,9 @@ rangelock_init(struct rangelock *lock) void rangelock_destroy(struct rangelock *lock) { - struct rl_q_entry *e, *ep; - MPASS(!lock->sleepers); - if (rangelock_cheat_destroy(lock)) - return; - for (e = (struct rl_q_entry *)atomic_load_ptr(&lock->head); - e != NULL; e = rl_e_unmark(ep)) { - ep = atomic_load_ptr(&e->rl_q_next); - uma_zfree_smr(rl_entry_zone, e); - } + if (!rangelock_cheat_destroy(lock)) + rangelock_noncheating_destroy(lock); } static bool @@ -487,6 +481,50 @@ rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old, (uintptr_t)new) != 0); } +static void +rangelock_noncheating_destroy(struct rangelock *lock) +{ + struct rl_q_entry *cur, *free, *next, **prev; + + free = NULL; +again: + smr_enter(rl_smr); + prev = (struct rl_q_entry **)&lock->head; + cur = rl_q_load(prev); + MPASS(!rl_e_is_marked(cur)); + + for (;;) { + if (cur == NULL) + break; + if 
(rl_e_is_marked(cur)) + goto again; + + next = rl_q_load(&cur->rl_q_next); + if (rl_e_is_marked(next)) { + next = rl_e_unmark(next); + if (rl_q_cas(prev, cur, next)) { +#ifdef INVARIANTS + cur->rl_q_owner = NULL; +#endif + cur->rl_q_free = free; + free = cur; + cur = next; + continue; + } + smr_exit(rl_smr); + goto again; + } + + sleepq_lock(&lock->sleepers); + if (!rl_e_is_marked(cur)) { + rl_insert_sleep(lock); + goto again; + } + } + smr_exit(rl_smr); + rangelock_free_free(free); +} + enum RL_INSERT_RES { RL_TRYLOCK_FAILED, RL_LOCK_SUCCESS, From e228961d6e3b4a22a3684dc4f688cc3cd3cdff09 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sat, 10 Aug 2024 11:12:42 +0300 Subject: [PATCH 098/145] rangelock_destroy(): poison lock->head to trip fault on lock attempt Tested by: markj, pho Sponsored by: The FreeBSD Foundation --- sys/kern/kern_rangelock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index c01ed05e4a6a..992a19701611 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -344,6 +344,7 @@ rangelock_destroy(struct rangelock *lock) MPASS(!lock->sleepers); if (!rangelock_cheat_destroy(lock)) rangelock_noncheating_destroy(lock); + DEBUG_POISON_POINTER(*(void **)&lock->head); } static bool From 9467c1a69b81fff65b85f1e142c6dff196e66ba7 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sun, 11 Aug 2024 07:25:21 +0300 Subject: [PATCH 099/145] rangelock: assert that we never insert or remove our entry after a logically deleted one Tested by: markj, pho Sponsored by: The FreeBSD Foundation --- sys/kern/kern_rangelock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index 992a19701611..2667bf30fb6f 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -478,6 +478,7 @@ static bool rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old, struct rl_q_entry *new) { + MPASS(!rl_e_is_marked(old)); return 
(atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old, (uintptr_t)new) != 0); } @@ -647,6 +648,7 @@ rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock, } } + MPASS(!rl_e_is_marked(cur)); r = rl_e_compare(cur, e); if (r == -1) { prev = &cur->rl_q_next; From a725d61825f32ea00d07a2064431a02fd640313a Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Fri, 16 Aug 2024 08:45:52 +0300 Subject: [PATCH 100/145] rangelock: if CAS for removal failed, restart list iteration Our next pointer is invalid and cannot be followed. Tested by: markj, pho Sponsored by: The FreeBSD Foundation --- sys/kern/kern_rangelock.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index 2667bf30fb6f..d9042f364737 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -539,6 +539,7 @@ rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock, { struct rl_q_entry *cur, *next, **prev; +again: prev = &e->rl_q_next; cur = rl_q_load(prev); MPASS(!rl_e_is_marked(cur)); /* nobody can unlock e yet */ @@ -551,9 +552,10 @@ rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock, if (rl_q_cas(prev, cur, next)) { cur->rl_q_free = *free; *free = cur; + cur = next; + continue; } - cur = next; - continue; + goto again; } if (rl_e_is_rlock(cur)) { prev = &cur->rl_q_next; @@ -583,6 +585,7 @@ rl_w_validate(struct rangelock *lock, struct rl_q_entry *e, { struct rl_q_entry *cur, *next, **prev; +again: prev = (struct rl_q_entry **)&lock->head; cur = rl_q_load(prev); MPASS(!rl_e_is_marked(cur)); /* head is not marked */ @@ -595,9 +598,10 @@ rl_w_validate(struct rangelock *lock, struct rl_q_entry *e, if (rl_q_cas(prev, cur, next)) { cur->rl_q_next = *free; *free = cur; + cur = next; + continue; } - cur = next; - continue; + goto again; } if (cur->rl_q_end <= e->rl_q_start) { prev = &cur->rl_q_next; @@ -642,9 +646,10 @@ rl_insert(struct rangelock *lock, 
struct rl_q_entry *e, bool trylock, #endif cur->rl_q_free = *free; *free = cur; + cur = next; + continue; } - cur = next; - continue; + goto again; } } From c4d8b2462e771e409d29df5545cbfb8465673818 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 20 Aug 2024 15:59:35 +0300 Subject: [PATCH 101/145] rangelocks: recheck that entry is not marked after sleepq is locked in rl_w_validate() otherwise we might lose the wakeup. Reported and tested by: markj Sponsored by: The FreeBSD Foundation --- sys/kern/kern_rangelock.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index d9042f364737..ef9d0104af72 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -609,6 +609,12 @@ rl_w_validate(struct rangelock *lock, struct rl_q_entry *e, continue; } sleepq_lock(&lock->sleepers); + /* Reload after sleepq is locked */ + next = rl_q_load(&cur->rl_q_next); + if (rl_e_is_marked(next)) { + sleepq_release(&lock->sleepers); + goto again; + } rangelock_unlock_int(lock, e); if (trylock) { sleepq_release(&lock->sleepers); From 40bffb7d2124141b9f2970a4de93cf57f5fbd91b Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 21 Aug 2024 01:07:10 +0300 Subject: [PATCH 102/145] rangelocks: fix typo in rl_w_validate The freed elements should be threaded using rl_q_free pointer. 
Reported by: dougm, markj Tested by: markj Sponsored by: The FreeBSD Foundation --- sys/kern/kern_rangelock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c index ef9d0104af72..e3bb413ab1bb 100644 --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -596,7 +596,7 @@ rl_w_validate(struct rangelock *lock, struct rl_q_entry *e, if (rl_e_is_marked(next)) { next = rl_e_unmark(next); if (rl_q_cas(prev, cur, next)) { - cur->rl_q_next = *free; + cur->rl_q_free = *free; *free = cur; cur = next; continue; From 45543d3424d46f84a5399879e190fc359dcefbd4 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 20 Aug 2024 17:41:33 +0300 Subject: [PATCH 103/145] DMAR: clear dmar_devs[unit] if attach failed This should stop attempts to use a unit which was not completely initialized, but referenced by ACPI DMAR table during scoped devices operations. PR: 280817 Sponsored by: Advanced Micro Devices (AMD) Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D46382 --- sys/x86/iommu/intel_drv.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sys/x86/iommu/intel_drv.c b/sys/x86/iommu/intel_drv.c index 0b25620114cd..636534173715 100644 --- a/sys/x86/iommu/intel_drv.c +++ b/sys/x86/iommu/intel_drv.c @@ -422,6 +422,7 @@ dmar_attach(device_t dev) &unit->reg_rid, RF_ACTIVE); if (unit->regs == NULL) { device_printf(dev, "cannot allocate register window\n"); + dmar_devs[unit->iommu.unit] = NULL; return (ENOMEM); } unit->hw_ver = dmar_read4(unit, DMAR_VER_REG); @@ -449,6 +450,7 @@ dmar_attach(device_t dev) error = dmar_alloc_irq(dev, unit, DMAR_INTR_FAULT); if (error != 0) { dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } if (DMAR_HAS_QI(unit)) { @@ -463,6 +465,7 @@ dmar_attach(device_t dev) error = dmar_alloc_irq(dev, unit, DMAR_INTR_QI); if (error != 0) { dmar_release_resources(dev, unit); + 
dmar_devs[unit->iommu.unit] = NULL; return (error); } } @@ -496,12 +499,14 @@ dmar_attach(device_t dev) if (error != 0) { DMAR_UNLOCK(unit); dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } error = dmar_inv_ctx_glob(unit); if (error != 0) { DMAR_UNLOCK(unit); dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } if ((unit->hw_ecap & DMAR_ECAP_DI) != 0) { @@ -509,6 +514,7 @@ dmar_attach(device_t dev) if (error != 0) { DMAR_UNLOCK(unit); dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } } @@ -517,16 +523,19 @@ dmar_attach(device_t dev) error = dmar_init_fault_log(unit); if (error != 0) { dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } error = dmar_init_qi(unit); if (error != 0) { dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } error = dmar_init_irt(unit); if (error != 0) { dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } @@ -542,6 +551,7 @@ dmar_attach(device_t dev) error = iommu_init_busdma(&unit->iommu); if (error != 0) { dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } @@ -551,6 +561,7 @@ dmar_attach(device_t dev) if (error != 0) { DMAR_UNLOCK(unit); dmar_release_resources(dev, unit); + dmar_devs[unit->iommu.unit] = NULL; return (error); } DMAR_UNLOCK(unit); From 111c7fc2fe21356a637f89fa58c407958f05ad93 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 14 May 2024 01:40:07 +0300 Subject: [PATCH 104/145] amd64: add convenience wrappers for INVLPGB and TLBSYNC Reviewed by: alc, markj Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D45191 --- sys/amd64/include/cpufunc.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sys/amd64/include/cpufunc.h 
b/sys/amd64/include/cpufunc.h index 62e782304fca..ca53d73b0186 100644 --- a/sys/amd64/include/cpufunc.h +++ b/sys/amd64/include/cpufunc.h @@ -525,6 +525,29 @@ invpcid(struct invpcid_descr *d, int type) : : "r" (d), "r" ((u_long)type) : "memory"); } +#define INVLPGB_VA 0x0001 +#define INVLPGB_PCID 0x0002 +#define INVLPGB_ASID 0x0004 +#define INVLPGB_GLOB 0x0008 +#define INVLPGB_FIN 0x0010 +#define INVLPGB_NEST 0x0020 + +#define INVLPGB_DESCR(asid, pcid) (((pcid) << 16) | (asid)) + +#define INVLPGB_2M_CNT (1u << 31) + +static __inline void +invlpgb(uint64_t rax, uint32_t edx, uint32_t ecx) +{ + __asm __volatile("invlpgb" : : "a" (rax), "d" (edx), "c" (ecx)); +} + +static __inline void +tlbsync(void) +{ + __asm __volatile("tlbsync"); +} + static __inline u_short rfs(void) { From bc4ffcadf2681c954444e1853200dca3f5e65676 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 14 May 2024 01:41:17 +0300 Subject: [PATCH 105/145] amd64: add variables indicating INVLPGB works Reviewed by: alc, markj Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D45191 --- sys/amd64/amd64/machdep.c | 6 ++++++ sys/amd64/amd64/pmap.c | 4 ++++ sys/amd64/include/pmap.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index c509dcc2f7dd..025c3c365de5 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1487,6 +1487,12 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) finishidentcpu(); /* Final stage of CPU initialization */ + invlpgb_works = (amd_extended_feature_extensions & + AMDFEID_INVLPGB) != 0; + TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works); + if (invlpgb_works) + invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT; + /* * Initialize the clock before the console so that console * initialization can use DELAY(). 
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index cf0fc7184f56..ff5c229b7652 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -550,6 +550,10 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); +int invlpgb_works; +SYSCTL_INT(_vm_pmap, OID_AUTO, invlpgb_works, CTLFLAG_RD, &invlpgb_works, 0, + "Is the invlpgb instruction available?"); +int invlpgb_maxcnt; int pmap_pcid_invlpg_workaround = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 273693e1f782..0819b3bc2945 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -424,6 +424,8 @@ extern vm_offset_t virtual_end; extern vm_paddr_t dmaplimit; extern int pmap_pcid_enabled; extern int invpcid_works; +extern int invlpgb_works; +extern int invlpgb_maxcnt; extern int pmap_pcid_invlpg_workaround; extern int pmap_pcid_invlpg_workaround_uena; From 47656cc1ef1cac307f24de88a4fe23a1389af44e Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 14 May 2024 01:42:08 +0300 Subject: [PATCH 106/145] amd64: use INVLPGB for kernel pmap invalidations avoiding broadcast IPIs. 
Reviewed by: alc, markj Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D45191 --- sys/amd64/amd64/mp_machdep.c | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 91737637b714..12abb8b6bf8b 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -679,6 +679,20 @@ smp_targeted_tlb_shootdown_native(pmap_t pmap, vm_offset_t addr1, vm_offset_t ad void smp_masked_invltlb(pmap_t pmap, smp_invl_cb_t curcpu_cb) { + if (invlpgb_works && pmap == kernel_pmap) { + invlpgb(INVLPGB_GLOB, 0, 0); + + /* + * TLBSYNC syncs only against INVLPGB executed on the + * same CPU. Since current thread is pinned by + * caller, we do not need to enter critical section to + * prevent migration. + */ + tlbsync(); + sched_unpin(); + return; + } + smp_targeted_tlb_shootdown(pmap, 0, 0, curcpu_cb, invl_op_tlb); #ifdef COUNT_XINVLTLB_HITS ipi_global++; @@ -688,6 +702,13 @@ smp_masked_invltlb(pmap_t pmap, smp_invl_cb_t curcpu_cb) void smp_masked_invlpg(vm_offset_t addr, pmap_t pmap, smp_invl_cb_t curcpu_cb) { + if (invlpgb_works && pmap == kernel_pmap) { + invlpgb(INVLPGB_GLOB | INVLPGB_VA | trunc_page(addr), 0, 0); + tlbsync(); + sched_unpin(); + return; + } + smp_targeted_tlb_shootdown(pmap, addr, 0, curcpu_cb, invl_op_pg); #ifdef COUNT_XINVLTLB_HITS ipi_page++; @@ -698,6 +719,39 @@ void smp_masked_invlpg_range(vm_offset_t addr1, vm_offset_t addr2, pmap_t pmap, smp_invl_cb_t curcpu_cb) { + if (invlpgb_works && pmap == kernel_pmap) { + vm_offset_t va; + uint64_t cnt, total; + + addr1 = trunc_page(addr1); + addr2 = round_page(addr2); + total = atop(addr2 - addr1); + for (va = addr1; total > 0;) { + if ((va & PDRMASK) != 0 || total < NPDEPG) { + cnt = atop(NBPDR - (va & PDRMASK)); + if (cnt > total) + cnt = total; + if (cnt > invlpgb_maxcnt + 1) + cnt = invlpgb_maxcnt + 1; + invlpgb(INVLPGB_GLOB | 
INVLPGB_VA | va, 0, + cnt - 1); + va += ptoa(cnt); + total -= cnt; + } else { + cnt = total / NPTEPG; + if (cnt > invlpgb_maxcnt + 1) + cnt = invlpgb_maxcnt + 1; + invlpgb(INVLPGB_GLOB | INVLPGB_VA | va, 0, + INVLPGB_2M_CNT | (cnt - 1)); + va += cnt << PDRSHIFT; + total -= cnt * NPTEPG; + } + } + tlbsync(); + sched_unpin(); + return; + } + smp_targeted_tlb_shootdown(pmap, addr1, addr2, curcpu_cb, invl_op_pgrng); #ifdef COUNT_XINVLTLB_HITS From 297a9e552b9a5adf07d195eae9649b0758f395af Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 21 Aug 2024 13:37:48 -0400 Subject: [PATCH 107/145] libcxxrt: Add a stub implementation of __cxa_call_terminate This function is called by GCC 14 if a destructor invoked during exception unwinding throws an exception. Reviewed by: emaste Obtained from: libcxxrt commit 391a3dcc1054e18c2f0dff4e14d6d79ac95399d7 Differential Revision: https://reviews.freebsd.org/D46004 --- contrib/libcxxrt/exception.cc | 13 +++++++++++++ lib/libcxxrt/Version.map | 4 ++++ lib/libcxxrt/Version.map.arm | 4 ++++ 3 files changed, 21 insertions(+) diff --git a/contrib/libcxxrt/exception.cc b/contrib/libcxxrt/exception.cc index 35ff997dd445..b56333e979a2 100644 --- a/contrib/libcxxrt/exception.cc +++ b/contrib/libcxxrt/exception.cc @@ -1433,6 +1433,19 @@ extern "C" void __cxa_call_unexpected(void*exception) abort(); } +/** + * ABI function, called when an object destructor exits due to an + * exception during stack unwinding. + * + * This function does not return. + */ +extern "C" void __cxa_call_terminate(void *exception) throw() +{ + std::terminate(); + // Should not be reached. + abort(); +} + /** * ABI function, returns the adjusted pointer to the exception object. 
*/ diff --git a/lib/libcxxrt/Version.map b/lib/libcxxrt/Version.map index 012026079e33..02cce34c234f 100644 --- a/lib/libcxxrt/Version.map +++ b/lib/libcxxrt/Version.map @@ -253,6 +253,10 @@ CXXABI_1.3.11 { __cxa_init_primary_exception; } CXXABI_1.3.9; +CXXABI_1.3.15 { + __cxa_call_terminate; +} CXXABI_1.3.11; + CXXRT_1.0 { extern "C++" { diff --git a/lib/libcxxrt/Version.map.arm b/lib/libcxxrt/Version.map.arm index bc4cf68a3654..aef918149b79 100644 --- a/lib/libcxxrt/Version.map.arm +++ b/lib/libcxxrt/Version.map.arm @@ -254,6 +254,10 @@ CXXABI_1.3.11 { __cxa_init_primary_exception; } CXXABI_1.3.9; +CXXABI_1.3.15 { + __cxa_call_terminate; +} CXXABI_1.3.11; + CXXRT_1.0 { extern "C++" { From d668a0b0abe4b079ac2a0d9cd0d7b71bd4a043c6 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Fri, 26 Jul 2024 15:39:35 -0700 Subject: [PATCH 108/145] cxgbe(4): reset routine for general use. Add a reset_adapter wrapper that picks the most suitable reset routine internally. Use it in the fatal error handler as well as the sysctl based reset. 
MFC after: 1 week Sponsored by: Chelsio Communications --- sys/dev/cxgbe/t4_main.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 57c1eeceab22..5fedf10869d2 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -2534,6 +2534,15 @@ reset_adapter_with_pl_rst(struct adapter *sc) return (0); } +static inline int +reset_adapter(struct adapter *sc) +{ + if (vm_guest == 0) + return (reset_adapter_with_pci_bus_reset(sc)); + else + return (reset_adapter_with_pl_rst(sc)); +} + static void reset_adapter_task(void *arg, int pending) { @@ -2544,10 +2553,7 @@ reset_adapter_task(void *arg, int pending) if (pending > 1) CH_ALERT(sc, "%s: pending %d\n", __func__, pending); - if (vm_guest == 0) - rc = reset_adapter_with_pci_bus_reset(sc); - else - rc = reset_adapter_with_pl_rst(sc); + rc = reset_adapter(sc); if (rc != 0) { CH_ERR(sc, "adapter did not reset properly, rc = %d, " "flags 0x%08x -> 0x%08x, err_flags 0x%08x -> 0x%08x.\n", @@ -3650,7 +3656,7 @@ fatal_error_task(void *arg, int pending) if (t4_reset_on_fatal_err) { CH_ALERT(sc, "resetting adapter after fatal error.\n"); - rc = reset_adapter_with_pci_bus_reset(sc); + rc = reset_adapter(sc); if (rc == 0 && t4_panic_on_fatal_err) { CH_ALERT(sc, "reset was successful, " "system will NOT panic.\n"); From d48524e21f1a49752485418324538755571ed13f Mon Sep 17 00:00:00 2001 From: Doug Moore Date: Wed, 21 Aug 2024 15:48:59 -0500 Subject: [PATCH 109/145] dev_pager: define free_page for mgt devices Callers of cdev_pager_free_page in the kernel always have object->type == OBJT_MGTDEVICE. Define a function for them to call that skips the runtime type check in cdev_pager_free. 
Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D46389 --- sys/arm/nvidia/drm2/tegra_bo.c | 2 +- sys/compat/linuxkpi/common/src/linux_page.c | 2 +- sys/dev/drm2/ttm/ttm_bo_vm.c | 2 +- sys/dev/xen/gntdev/gntdev.c | 2 +- sys/dev/xen/privcmd/privcmd.c | 2 +- sys/vm/device_pager.c | 24 +++++++++++++++------ sys/vm/vm_pager.h | 1 + 7 files changed, 24 insertions(+), 11 deletions(-) diff --git a/sys/arm/nvidia/drm2/tegra_bo.c b/sys/arm/nvidia/drm2/tegra_bo.c index 08cd3de6a3fe..346118b78c2b 100644 --- a/sys/arm/nvidia/drm2/tegra_bo.c +++ b/sys/arm/nvidia/drm2/tegra_bo.c @@ -62,7 +62,7 @@ tegra_bo_destruct(struct tegra_bo *bo) for (i = 0; i < bo->npages; i++) { m = bo->m[i]; vm_page_busy_acquire(m, 0); - cdev_pager_free_page(bo->cdev_pager, m); + cdev_mgtdev_pager_free_page(bo->cdev_pager, m); m->flags &= ~PG_FICTITIOUS; vm_page_unwire_noq(m); vm_page_free(m); diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c index d4f8e75a3251..25243382f9ea 100644 --- a/sys/compat/linuxkpi/common/src/linux_page.c +++ b/sys/compat/linuxkpi/common/src/linux_page.c @@ -436,7 +436,7 @@ lkpi_unmap_mapping_range(void *obj, loff_t const holebegin __unused, continue; if (!vm_page_busy_acquire(page, VM_ALLOC_WAITFAIL)) goto retry; - cdev_pager_free_page(devobj, page); + cdev_mgtdev_pager_free_page(devobj, page); } VM_OBJECT_WUNLOCK(devobj); vm_object_deallocate(devobj); diff --git a/sys/dev/drm2/ttm/ttm_bo_vm.c b/sys/dev/drm2/ttm/ttm_bo_vm.c index 4f6c66382453..e543b8dfb993 100644 --- a/sys/dev/drm2/ttm/ttm_bo_vm.c +++ b/sys/dev/drm2/ttm/ttm_bo_vm.c @@ -376,7 +376,7 @@ ttm_bo_release_mmap(struct ttm_buffer_object *bo) continue; if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) goto retry; - cdev_pager_free_page(vm_obj, m); + cdev_mgtdev_pager_free_page(vm_obj, m); } VM_OBJECT_WUNLOCK(vm_obj); diff --git a/sys/dev/xen/gntdev/gntdev.c b/sys/dev/xen/gntdev/gntdev.c index 4530feb1c76d..49f8aefad62e 100644 --- 
a/sys/dev/xen/gntdev/gntdev.c +++ b/sys/dev/xen/gntdev/gntdev.c @@ -600,7 +600,7 @@ notify_unmap_cleanup(struct gntdev_gmap *gmap) continue; if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) goto retry; - cdev_pager_free_page(gmap->map->mem, m); + cdev_mgtdev_pager_free_page(gmap->map->mem, m); } VM_OBJECT_WUNLOCK(gmap->map->mem); diff --git a/sys/dev/xen/privcmd/privcmd.c b/sys/dev/xen/privcmd/privcmd.c index 02e268b23d42..c04ac287183b 100644 --- a/sys/dev/xen/privcmd/privcmd.c +++ b/sys/dev/xen/privcmd/privcmd.c @@ -135,7 +135,7 @@ privcmd_pg_dtor(void *handle) continue; if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) goto retry; - cdev_pager_free_page(map->mem, m); + cdev_mgtdev_pager_free_page(map->mem, m); } VM_OBJECT_WUNLOCK(map->mem); diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index 4f8651411851..a5be05efc6d9 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -262,13 +262,25 @@ void cdev_pager_free_page(vm_object_t object, vm_page_t m) { - VM_OBJECT_ASSERT_WLOCKED(object); - if (object->type == OBJT_MGTDEVICE) { - KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("unmanaged %p", m)); - pmap_remove_all(m); - (void)vm_page_remove(m); - } else if (object->type == OBJT_DEVICE) + if (object->type == OBJT_MGTDEVICE) + cdev_mgtdev_pager_free_page(object, m); + else if (object->type == OBJT_DEVICE) dev_pager_free_page(object, m); + else + KASSERT(false, + ("Invalid device type obj %p m %p", object, m)); +} + +void +cdev_mgtdev_pager_free_page(vm_object_t object, vm_page_t m) +{ + + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT((object->type == OBJT_MGTDEVICE && + (m->oflags & VPO_UNMANAGED) == 0), + ("Unmanaged device or page obj %p m %p", object, m)); + pmap_remove_all(m); + (void)vm_page_remove(m); } static void diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h index 7d6b2e96b38c..d30bf349e411 100644 --- a/sys/vm/vm_pager.h +++ b/sys/vm/vm_pager.h @@ -300,6 +300,7 @@ vm_object_t cdev_pager_allocate(void *handle, enum obj_type tp, 
vm_ooffset_t foff, struct ucred *cred); vm_object_t cdev_pager_lookup(void *handle); void cdev_pager_free_page(vm_object_t object, vm_page_t m); +void cdev_mgtdev_pager_free_page(vm_object_t object, vm_page_t m); struct phys_pager_ops { int (*phys_pg_getpages)(vm_object_t vm_obj, vm_page_t *m, int count, From c0c1b1cd899aae5712786ce73469422fd5d3918a Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 21 Aug 2024 17:14:56 -0400 Subject: [PATCH 110/145] freebsd32: Fix a few typos in syscalls.conf comments Sponsored by: AFRL, DARPA --- sys/compat/freebsd32/syscalls.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/compat/freebsd32/syscalls.conf b/sys/compat/freebsd32/syscalls.conf index 055a041cf72a..a6e47cf470f1 100644 --- a/sys/compat/freebsd32/syscalls.conf +++ b/sys/compat/freebsd32/syscalls.conf @@ -18,7 +18,7 @@ abi_ptr_array_t="uint32_t" abi_headers="#include " # -# Variables below this line are exceptions to the ABI changes programatically +# Variables below this line are exceptions to the ABI changes programmatically # detected by makesyscalls.lua. New system calls should not require an entry # here in nearly virtually all cases. New entries are almost certainly # representative of badly designed interfaces. 
@@ -53,5 +53,5 @@ obsol="getkerninfo" # nlm_syscall - requires significant porting, probably doesn't make sense # nnpfs_syscall - requires significant porting, probably doesn't make sense # ntp_gettime - should be implemented -# thr_create - was unimplemented and appears to be unnecessicary +# thr_create - was unimplemented and appears to be unnecessary unimpl="afs3_syscall kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create" From d399c791b223953d230423eff1d56fb45a1be77c Mon Sep 17 00:00:00 2001 From: Maxim Konovalov Date: Thu, 22 Aug 2024 00:29:36 +0000 Subject: [PATCH 111/145] bsd-family-tree: NetBSD 8.3 added PR: 280983 --- share/misc/bsd-family-tree | 99 ++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 48 deletions(-) diff --git a/share/misc/bsd-family-tree b/share/misc/bsd-family-tree index 1d8dacf393be..0849e883e2c3 100644 --- a/share/misc/bsd-family-tree +++ b/share/misc/bsd-family-tree @@ -413,54 +413,56 @@ FreeBSD 5.2 | | | | | | | | | | | | DragonFly 5.6.3 | | | | | | NetBSD | | | | | | | | 8.2 | | - | | | | | | | DragonFly 5.8.1 - | | | | | | OpenBSD 6.7 | - | | FreeBSD | | | | | - | | 11.4 | | | | | - | | | | | | DragonFly 5.8.2 - | | | | | | DragonFly 5.8.3 - | | | | NetBSD 9.1 OpenBSD 6.8 | - | FreeBSD macOS | | | | - | 12.2 11 | | | | - | | | | | | | - | `------. 
| | | | | - | | | | | | | - *--FreeBSD | | | | | | - | 13.0 | | | NetBSD 9.2 OpenBSD 6.9 DragonFly 6.0.0 - | | | | | | | | - | | | | | | | DragonFly 6.0.1 - | | | | | | | | - | | FreeBSD macOS | | OpenBSD 7.0 | - | | 12.3 12 | | | | - | | | | | | | DragonFly 6.2.1 - | | | | | | OpenBSD 7.1 | - | FreeBSD | | | | | | - | 13.1 | | | | | | - | | | | | | | DragonFly 6.2.2 - | | | | | NetBSD 9.3 | | - | | | macOS | | OpenBSD 7.2 | - | | | 13 | | | | - | | FreeBSD | | | | | - | | 12.4 | | | | | - | | | | | | DragonFly 6.4.0 - | | | | | OpenBSD 7.3 | - | FreeBSD | | | | | - | 13.2 | | | | | - | | | | | | | - | `------. | | | | | - | | macOS | | | | - | | 14 | | | | - | | | | | OpenBSD 7.4 | - *--FreeBSD | | | | | | - | 14.0 | | | | | | - | | | | | | | | - | | FreeBSD | | NetBSD 9.4 | | - | | 13.3 | | | | - | | | *--NetBSD | | - | | | | 10.0 | | - | | | | | | - | | | | OpenBSD 7.5 | - | FreeBSD | | | | + | | | | | | | | DragonFly 5.8.1 + | | | | | | | OpenBSD 6.7 | + | | FreeBSD | | | | | | + | | 11.4 | | | | | | + | | | | | | | DragonFly 5.8.2 + | | | | | | | DragonFly 5.8.3 + | | | | NetBSD | OpenBSD 6.8 | + | FreeBSD macOS | 9.1 | | | + | 12.2 11 | | | | | + | | | | | | | | + | `------. | | | | | | + | | | | | | | | + *--FreeBSD | | | | | | | + | 13.0 | | | NetBSD | OpenBSD 6.9 DragonFly 6.0.0 + | | | | | 9.2 | | | + | | | | | | | | DragonFly 6.0.1 + | | | | | | | | | + | | FreeBSD macOS | | | OpenBSD 7.0 | + | | 12.3 12 | | | | | + | | | | | | | | DragonFly 6.2.1 + | | | | | | | OpenBSD 7.1 | + | FreeBSD | | | | | | | + | 13.1 | | | | | | | + | | | | | | | | DragonFly 6.2.2 + | | | | | NetBSD | | | + | | | macOS | 9.3 | OpenBSD 7.2 | + | | | 13 | | | | | + | | FreeBSD | | | | | | + | | 12.4 | | | | | | + | | | | | | | DragonFly 6.4.0 + | | | | | | OpenBSD 7.3 | + | FreeBSD | | | | | | + | 13.2 | | | | | | + | | | | | | | | + | `------. 
| | | | | | + | | macOS | | | | | + | | 14 | | | | | + | | | | | | OpenBSD 7.4 | + *--FreeBSD | | | | | | | + | 14.0 | | | | | | | + | | | | | | | | | + | | FreeBSD | | NetBSD | | | + | | 13.3 | | 9.4 | | | + | | | | | | | + | | | *--NetBSD | | | + | | | | 10.0 | | | + | | | | | | | + | | | | | OpenBSD 7.5 | + | | | | NetBSD | | + | FreeBSD | | 8.3 | | | 14.1 | | | | | | | | | FreeBSD 15 -current | NetBSD -current OpenBSD -current DragonFly -current @@ -895,6 +897,7 @@ FreeBSD 13.3 2024-03-05 [FBD] NetBSD 10.0 2024-03-28 [NBD] OpenBSD 7.5 2024-04-05 [OBD] NetBSD 9.4 2024-04-20 [NBD] +NetBSD 8.3 2024-05-04 [NBD] FreeBSD 14.1 2024-06-04 [FBD] Bibliography From a1d9ce19b13f220c5738e6aa58cf0c3750a05526 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Thu, 22 Aug 2024 09:07:27 +0200 Subject: [PATCH 112/145] sctp: fix format of sysctl variables MFC after: 1 week --- sys/netinet/sctp_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/netinet/sctp_sysctl.c b/sys/netinet/sctp_sysctl.c index a4be3471e2fd..a39429ec046e 100644 --- a/sys/netinet/sctp_sysctl.c +++ b/sys/netinet/sctp_sysctl.c @@ -894,7 +894,7 @@ sctp_sysctl_handle_trace_log_clear(SYSCTL_HANDLER_ARGS) return (error); \ } \ SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mib_name, flags, NULL, 0, \ - sctp_sysctl_handle_##mib_name, "UI", prefix##_DESC) + sctp_sysctl_handle_##mib_name, "IU", prefix##_DESC) #define SCTP_UINT_SYSCTL_RDTUN(mib_name, var_name, prefix) \ SYSCTL_UINT(_net_inet_sctp, OID_AUTO, mib_name, \ From 0f64fc6a3486454ad708f517633f930e611fd6d2 Mon Sep 17 00:00:00 2001 From: Zhenlei Huang Date: Thu, 22 Aug 2024 18:00:34 +0800 Subject: [PATCH 113/145] kern: Align the declaration of kernconfstring with its definition It is defined as const char[] in config.c which is auto generated by usr.sbin/config/kernconf.tmpl . While here prefer macro SYSCTL_CONST_STRING to avoid casting. 
MFC after: 1 week --- sys/kern/kern_mib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c index fe6e49865682..0132478aa68a 100644 --- a/sys/kern/kern_mib.c +++ b/sys/kern/kern_mib.c @@ -464,10 +464,10 @@ SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, #ifdef INCLUDE_CONFIG_FILE /* Actual kernel configuration options. */ -extern char kernconfstring[]; +extern const char kernconfstring[]; -SYSCTL_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD, - kernconfstring, 0, "Kernel configuration file"); +SYSCTL_CONST_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD, + kernconfstring, "Kernel configuration file"); #endif static int From 498286d4e807d6b9e4caad22b96ebca7f16e9b18 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Thu, 22 Aug 2024 14:44:47 +0200 Subject: [PATCH 114/145] tcp: fix format of sysctl variable The format for CTLTYPE_UINT is "IU" instead of "UI" as specified in sysctl.9. Reviewed by: cc, zlei MFC after: 1 week Sponsored by: Netflix, Inc. 
Differential Revision: https://reviews.freebsd.org/D46408 --- sys/netinet/tcp_syncache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 62a4a5a28878..025b071eb1ca 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -207,7 +207,7 @@ sysctl_net_inet_tcp_syncache_rexmtlimit_check(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_syncache.rexmt_limit), 0, - sysctl_net_inet_tcp_syncache_rexmtlimit_check, "UI", + sysctl_net_inet_tcp_syncache_rexmtlimit_check, "IU", "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; From 5680cf6dc6e25cffa3930e9cb06f6982fcb80209 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Thu, 22 Aug 2024 17:06:58 +0100 Subject: [PATCH 115/145] jemalloc: don't expose 3.0 compat symbols Don't provide default linkage for jemalloc 3.0 compatibility symbols. We stopped declaring these interfaces with the introduction of jemalloc 4.0 prior to FreeBSD 11.0. Any code using them would have had to declare them manually so stop declaring them and export the symbols directly for compatibility. Arguably they should be x86 only as they were never declared on other Tier-1 architectures. 
Reviewed by: imp, kib Differential Revision: https://reviews.freebsd.org/D46407 --- .../include/jemalloc/jemalloc_FreeBSD.h | 26 +++++++++++-------- lib/libc/stdlib/malloc/Symbol.map | 10 ------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/contrib/jemalloc/include/jemalloc/jemalloc_FreeBSD.h b/contrib/jemalloc/include/jemalloc/jemalloc_FreeBSD.h index 00848c0c48e3..dee4679838d8 100644 --- a/contrib/jemalloc/include/jemalloc/jemalloc_FreeBSD.h +++ b/contrib/jemalloc/include/jemalloc/jemalloc_FreeBSD.h @@ -118,7 +118,6 @@ extern int __isthreaded; #undef je_malloc_stats_print #undef je_allocm #undef je_rallocm -#undef je_sallocm #undef je_dallocm #undef je_nallocm #define je_malloc __malloc @@ -139,11 +138,6 @@ extern int __isthreaded; #define je_mallctlnametomib __mallctlnametomib #define je_mallctlbymib __mallctlbymib #define je_malloc_stats_print __malloc_stats_print -#define je_allocm __allocm -#define je_rallocm __rallocm -#define je_sallocm __sallocm -#define je_dallocm __dallocm -#define je_nallocm __nallocm #define open _open #define read _read #define write _write @@ -183,9 +177,19 @@ __weak_reference(__mallctl, mallctl); __weak_reference(__mallctlnametomib, mallctlnametomib); __weak_reference(__mallctlbymib, mallctlbymib); __weak_reference(__malloc_stats_print, malloc_stats_print); -__weak_reference(__allocm, allocm); -__weak_reference(__rallocm, rallocm); -__weak_reference(__sallocm, sallocm); -__weak_reference(__dallocm, dallocm); -__weak_reference(__nallocm, nallocm); +__weak_reference(je_allocm, weak_allocm); +__weak_reference(je_rallocm, weak_rallocm); +__weak_reference(je_sallocm, weak_sallocm); +__weak_reference(je_dallocm, weak_dallocm); +__weak_reference(je_nallocm, weak_nallocm); +__sym_compat(__allocm, je_allocm, FBSD_1.3); +__sym_compat(__rallocm, je_rallocm, FBSD_1.3); +__sym_compat(__sallocm, je_sallocm, FBSD_1.3); +__sym_compat(__dallocm, je_dallocm, FBSD_1.3); +__sym_compat(__nallocm, je_nallocm, FBSD_1.3); 
+__sym_compat(allocm, weak_allocm, FBSD_1.3); +__sym_compat(rallocm, weak_rallocm, FBSD_1.3); +__sym_compat(sallocm, weak_sallocm, FBSD_1.3); +__sym_compat(dallocm, weak_dallocm, FBSD_1.3); +__sym_compat(nallocm, weak_nallocm, FBSD_1.3); #endif diff --git a/lib/libc/stdlib/malloc/Symbol.map b/lib/libc/stdlib/malloc/Symbol.map index 15e4f9e6c69c..d3aa7f3f9988 100644 --- a/lib/libc/stdlib/malloc/Symbol.map +++ b/lib/libc/stdlib/malloc/Symbol.map @@ -23,11 +23,6 @@ FBSD_1.3 { sallocx; dallocx; nallocx; - allocm; - rallocm; - sallocm; - dallocm; - nallocm; __malloc; __calloc; __realloc; @@ -40,11 +35,6 @@ FBSD_1.3 { __sallocx; __dallocx; __nallocx; - __allocm; - __rallocm; - __sallocm; - __dallocm; - __nallocm; }; FBSD_1.4 { From 776cd02b891ccd984963c9ec26f9748d213f0b9b Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 22 Aug 2024 14:40:48 -0400 Subject: [PATCH 116/145] vmm ppt: Enable busmastering and BAR decoding while a device is assigned Reviewed by: corvink, markj Fixes: f44ff2aba2d6 bhyve: Treat the COMMAND register for PCI passthru devices as emulated Sponsored by: Chelsio Communications Differential Revision: https://reviews.freebsd.org/D46245 --- sys/amd64/vmm/io/ppt.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index 3451e91d9de1..9fcfdc7cb441 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -151,9 +151,13 @@ static int ppt_attach(device_t dev) { struct pptdev *ppt; + uint16_t cmd; ppt = device_get_softc(dev); + cmd = pci_read_config(dev, PCIR_COMMAND, 2); + cmd &= ~(PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + pci_write_config(dev, PCIR_COMMAND, cmd, 2); iommu_remove_device(iommu_host_domain(), pci_get_rid(dev)); num_pptdevs++; TAILQ_INSERT_TAIL(&pptdev_list, ppt, next); @@ -176,7 +180,6 @@ ppt_detach(device_t dev) return (EBUSY); num_pptdevs--; TAILQ_REMOVE(&pptdev_list, ppt, next); - pci_disable_busmaster(dev); if 
(iommu_host_domain() != NULL) iommu_add_device(iommu_host_domain(), pci_get_rid(dev)); @@ -376,11 +379,28 @@ ppt_pci_reset(device_t dev) pci_power_reset(dev); } +static uint16_t +ppt_bar_enables(struct pptdev *ppt) +{ + struct pci_map *pm; + uint16_t cmd; + + cmd = 0; + for (pm = pci_first_bar(ppt->dev); pm != NULL; pm = pci_next_bar(pm)) { + if (PCI_BAR_IO(pm->pm_value)) + cmd |= PCIM_CMD_PORTEN; + if (PCI_BAR_MEM(pm->pm_value)) + cmd |= PCIM_CMD_MEMEN; + } + return (cmd); +} + int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; int error; + uint16_t cmd; /* Passing NULL requires the device to be unowned. */ error = ppt_find(NULL, bus, slot, func, &ppt); @@ -392,6 +412,9 @@ ppt_assign_device(struct vm *vm, int bus, int slot, int func) pci_restore_state(ppt->dev); ppt->vm = vm; iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); + cmd = pci_read_config(ppt->dev, PCIR_COMMAND, 2); + cmd |= PCIM_CMD_BUSMASTEREN | ppt_bar_enables(ppt); + pci_write_config(ppt->dev, PCIR_COMMAND, cmd, 2); return (0); } @@ -400,11 +423,15 @@ ppt_unassign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; int error; + uint16_t cmd; error = ppt_find(vm, bus, slot, func, &ppt); if (error) return (error); + cmd = pci_read_config(ppt->dev, PCIR_COMMAND, 2); + cmd &= ~(PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + pci_write_config(ppt->dev, PCIR_COMMAND, cmd, 2); pci_save_state(ppt->dev); ppt_pci_reset(ppt->dev); pci_restore_state(ppt->dev); From 33658afd4e4d11cd71d92e52ca9da5381cdd829b Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Thu, 22 Aug 2024 20:36:44 +0100 Subject: [PATCH 117/145] rtld-elf: Pass parsed aux_info to ifunc_init Currently we pass the raw pointer to the on-stack auxargs. This can legitimately have fewer than AT_COUNT entries, so the use of __min_size(AT_COUNT), i.e. 
static AT_COUNT, is inaccurate, and also needlessly forces the callee to iterate over the elements to find the entry for a given type. Instead we can just pass aux_info like we use for everything else. Note that the argument has been left unused by every callee since its introduction in 4352999e0e6c ("Pass CPUID[1] %edx (cpu_feature), %ecx (cpu_feature2) and CPUID[7].%ebx (cpu_stdext_feature), %ecx (cpu_stdext_feature2) to the ifunc resolvers on x86.") Reviewed by: kib MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D46276 --- libexec/rtld-elf/aarch64/reloc.c | 2 +- libexec/rtld-elf/amd64/reloc.c | 2 +- libexec/rtld-elf/arm/reloc.c | 2 +- libexec/rtld-elf/i386/reloc.c | 2 +- libexec/rtld-elf/powerpc/reloc.c | 2 +- libexec/rtld-elf/powerpc64/reloc.c | 2 +- libexec/rtld-elf/riscv/reloc.c | 2 +- libexec/rtld-elf/rtld.c | 2 +- libexec/rtld-elf/rtld.h | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libexec/rtld-elf/aarch64/reloc.c b/libexec/rtld-elf/aarch64/reloc.c index f76ce7f4d580..3687c9385326 100644 --- a/libexec/rtld-elf/aarch64/reloc.c +++ b/libexec/rtld-elf/aarch64/reloc.c @@ -452,7 +452,7 @@ reloc_jmpslot(Elf_Addr *where, Elf_Addr target, } void -ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) { } diff --git a/libexec/rtld-elf/amd64/reloc.c b/libexec/rtld-elf/amd64/reloc.c index 9c5887def356..2e24d6287ea3 100644 --- a/libexec/rtld-elf/amd64/reloc.c +++ b/libexec/rtld-elf/amd64/reloc.c @@ -499,7 +499,7 @@ reloc_gnu_ifunc(Obj_Entry *obj, int flags, RtldLockState *lockstate) uint32_t cpu_feature, cpu_feature2, cpu_stdext_feature, cpu_stdext_feature2; void -ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) { u_int p[4], cpu_high; diff --git a/libexec/rtld-elf/arm/reloc.c b/libexec/rtld-elf/arm/reloc.c index 4683ebf74fee..b10e6ce844d5 100644 --- 
a/libexec/rtld-elf/arm/reloc.c +++ b/libexec/rtld-elf/arm/reloc.c @@ -442,7 +442,7 @@ reloc_jmpslot(Elf_Addr *where, Elf_Addr target, } void -ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) { } diff --git a/libexec/rtld-elf/i386/reloc.c b/libexec/rtld-elf/i386/reloc.c index 04a8354343bc..c91239eb3989 100644 --- a/libexec/rtld-elf/i386/reloc.c +++ b/libexec/rtld-elf/i386/reloc.c @@ -464,7 +464,7 @@ rtld_cpuid_count(int idx, int cnt, u_int *p) } void -ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) { u_int p[4], cpu_high; int cpuid_supported; diff --git a/libexec/rtld-elf/powerpc/reloc.c b/libexec/rtld-elf/powerpc/reloc.c index 5618efb7f716..82a9669e3e73 100644 --- a/libexec/rtld-elf/powerpc/reloc.c +++ b/libexec/rtld-elf/powerpc/reloc.c @@ -813,7 +813,7 @@ powerpc_abi_variant_hook(Elf_Auxinfo** aux_info) } void -ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) { } diff --git a/libexec/rtld-elf/powerpc64/reloc.c b/libexec/rtld-elf/powerpc64/reloc.c index 2d06d5821d1b..70acd0ac390d 100644 --- a/libexec/rtld-elf/powerpc64/reloc.c +++ b/libexec/rtld-elf/powerpc64/reloc.c @@ -710,7 +710,7 @@ powerpc64_abi_variant_hook(Elf_Auxinfo** aux_info) } void -ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) { } diff --git a/libexec/rtld-elf/riscv/reloc.c b/libexec/rtld-elf/riscv/reloc.c index 43522a5a790f..5ea005a813cb 100644 --- a/libexec/rtld-elf/riscv/reloc.c +++ b/libexec/rtld-elf/riscv/reloc.c @@ -376,7 +376,7 @@ reloc_non_plt(Obj_Entry *obj, Obj_Entry *obj_rtld, int flags, } void -ifunc_init(Elf_Auxinfo aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) { } diff --git a/libexec/rtld-elf/rtld.c 
b/libexec/rtld-elf/rtld.c index 1f0c59722ac6..a8db1198a899 100644 --- a/libexec/rtld-elf/rtld.c +++ b/libexec/rtld-elf/rtld.c @@ -904,7 +904,7 @@ _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp) exit (0); } - ifunc_init(aux); + ifunc_init(aux_info); /* * Setup TLS for main thread. This must be done after the diff --git a/libexec/rtld-elf/rtld.h b/libexec/rtld-elf/rtld.h index 656e980c6261..5527671d647e 100644 --- a/libexec/rtld-elf/rtld.h +++ b/libexec/rtld-elf/rtld.h @@ -438,7 +438,7 @@ int reloc_jmpslots(Obj_Entry *, int flags, struct Struct_RtldLockState *); int reloc_iresolve(Obj_Entry *, struct Struct_RtldLockState *); int reloc_iresolve_nonplt(Obj_Entry *, struct Struct_RtldLockState *); int reloc_gnu_ifunc(Obj_Entry *, int flags, struct Struct_RtldLockState *); -void ifunc_init(Elf_Auxinfo[__min_size(AT_COUNT)]); +void ifunc_init(Elf_Auxinfo *[__min_size(AT_COUNT)]); void init_pltgot(Obj_Entry *); void allocate_initial_tls(Obj_Entry *); From 228a3e73e16983bc7f985b24ef20909500792d3c Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Thu, 22 Aug 2024 20:36:44 +0100 Subject: [PATCH 118/145] riscv: Recognise B extension for AT_HWCAP This was ratified earlier this year as an alias for Zba_Zbb_Zbs. Whilst we don't currently export multi-letter extensions, we can still export this alias in AT_HWCAP. 
Reviewed by: mhorne MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D46277 --- sys/riscv/include/elf.h | 1 + sys/riscv/riscv/identcpu.c | 1 + 2 files changed, 2 insertions(+) diff --git a/sys/riscv/include/elf.h b/sys/riscv/include/elf.h index ff0f36d2efe6..a14d6859902b 100644 --- a/sys/riscv/include/elf.h +++ b/sys/riscv/include/elf.h @@ -82,5 +82,6 @@ __ElfType(Auxinfo); #define HWCAP_ISA_C HWCAP_ISA_BIT('c') #define HWCAP_ISA_G \ (HWCAP_ISA_I | HWCAP_ISA_M | HWCAP_ISA_A | HWCAP_ISA_F | HWCAP_ISA_D) +#define HWCAP_ISA_B HWCAP_ISA_BIT('b') #endif /* !_MACHINE_ELF_H_ */ diff --git a/sys/riscv/riscv/identcpu.c b/sys/riscv/riscv/identcpu.c index c681edfff47b..203edb3689bc 100644 --- a/sys/riscv/riscv/identcpu.c +++ b/sys/riscv/riscv/identcpu.c @@ -245,6 +245,7 @@ parse_riscv_isa(struct cpu_desc *desc, char *isa, int len) while (i < len) { switch(isa[i]) { case 'a': + case 'b': case 'c': case 'd': case 'f': From 729d2b16b74fa5207a12aa1de190bd930432810e Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Thu, 22 Aug 2024 20:36:44 +0100 Subject: [PATCH 119/145] rtld-elf: Support IFUNCs on riscv GNU/Linux has historically had the following two resolver prototypes: 1. Elf_Addr(uint64_t, void *) 2. Elf_Addr(uint64_t, void *, void *) For the former, AT_HWCAP is passed in the first argument, and NULL in the second. For the latter, AT_HWCAP is still passed, and the second argument is a pointer to their home-grown __riscv_hwprobe function. Should they want to use the third argument in future, they'll have to introduce yet another prototype to allow for later expansion, and then all users will have to check whether the second argument is NULL to know if the third argument really exists. This is all rather silly and will surely prove fun in the face of type-checking CFI. Instead, be like arm64 and just define all 8 possible general purpose register arguments up front. 
To naive source code that forgets non-Linux OSes exist this will be compatible with prototype 1 above, since the second argument will be 0 and it won't look further (though should we start using the second argument for something that wouldn't be true any more and it might think it's __riscv_hwprobe, but that incompatibility is one we can defer committing to, and can choose to never adopt). Until the standard interface for querying extension information[1] is settled and implemented in FreeBSD there's not much you can do in a resolver other than use HWCAP_ISA_B, but this gets the infrastructure in place for when that day comes. [1] https://github.com/riscv-non-isa/riscv-c-api-doc/pull/74 Reviewed by: kib, mhorne MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D46278 --- libexec/rtld-elf/riscv/reloc.c | 140 ++++++++++++++++++++++---- libexec/rtld-elf/riscv/rtld_machdep.h | 5 +- 2 files changed, 122 insertions(+), 23 deletions(-) diff --git a/libexec/rtld-elf/riscv/reloc.c b/libexec/rtld-elf/riscv/reloc.c index 5ea005a813cb..aa2cc97ae769 100644 --- a/libexec/rtld-elf/riscv/reloc.c +++ b/libexec/rtld-elf/riscv/reloc.c @@ -152,10 +152,20 @@ reloc_plt(Obj_Entry *obj, int flags __unused, RtldLockState *lockstate __unused) for (rela = obj->pltrela; rela < relalim; rela++) { Elf_Addr *where; - assert(ELF_R_TYPE(rela->r_info) == R_RISCV_JUMP_SLOT); - where = (Elf_Addr *)(obj->relocbase + rela->r_offset); - *where += (Elf_Addr)obj->relocbase; + + switch (ELF_R_TYPE(rela->r_info)) { + case R_RISCV_JUMP_SLOT: + *where += (Elf_Addr)obj->relocbase; + break; + case R_RISCV_IRELATIVE: + obj->irelative = true; + break; + default: + _rtld_error("Unknown relocation type %u in PLT", + (unsigned int)ELF_R_TYPE(rela->r_info)); + return (-1); + } } return (0); @@ -187,6 +197,11 @@ reloc_jmpslots(Obj_Entry *obj, int flags, RtldLockState *lockstate) return (-1); } + if (ELF_ST_TYPE(def->st_info) == STT_GNU_IFUNC) { + obj->gnu_ifunc = true; + continue; + } + *where = 
(Elf_Addr)(defobj->relocbase + def->st_value); break; default: @@ -199,30 +214,89 @@ reloc_jmpslots(Obj_Entry *obj, int flags, RtldLockState *lockstate) return (0); } +static void +reloc_iresolve_one(Obj_Entry *obj, const Elf_Rela *rela, + RtldLockState *lockstate) +{ + Elf_Addr *where, target, *ptr; + + ptr = (Elf_Addr *)(obj->relocbase + rela->r_addend); + where = (Elf_Addr *)(obj->relocbase + rela->r_offset); + lock_release(rtld_bind_lock, lockstate); + target = call_ifunc_resolver(ptr); + wlock_acquire(rtld_bind_lock, lockstate); + *where = target; +} + int -reloc_iresolve(Obj_Entry *obj __unused, - struct Struct_RtldLockState *lockstate __unused) +reloc_iresolve(Obj_Entry *obj, struct Struct_RtldLockState *lockstate) { + const Elf_Rela *relalim; + const Elf_Rela *rela; + + if (!obj->irelative) + return (0); - /* XXX not implemented */ + obj->irelative = false; + relalim = (const Elf_Rela *)((const char *)obj->pltrela + + obj->pltrelasize); + for (rela = obj->pltrela; rela < relalim; rela++) { + if (ELF_R_TYPE(rela->r_info) == R_RISCV_IRELATIVE) + reloc_iresolve_one(obj, rela, lockstate); + } return (0); } int -reloc_iresolve_nonplt(Obj_Entry *obj __unused, - struct Struct_RtldLockState *lockstate __unused) +reloc_iresolve_nonplt(Obj_Entry *obj, struct Struct_RtldLockState *lockstate) { + const Elf_Rela *relalim; + const Elf_Rela *rela; - /* XXX not implemented */ + if (!obj->irelative_nonplt) + return (0); + + obj->irelative_nonplt = false; + relalim = (const Elf_Rela *)((const char *)obj->rela + obj->relasize); + for (rela = obj->rela; rela < relalim; rela++) { + if (ELF_R_TYPE(rela->r_info) == R_RISCV_IRELATIVE) + reloc_iresolve_one(obj, rela, lockstate); + } return (0); } int -reloc_gnu_ifunc(Obj_Entry *obj __unused, int flags __unused, - struct Struct_RtldLockState *lockstate __unused) +reloc_gnu_ifunc(Obj_Entry *obj, int flags, + struct Struct_RtldLockState *lockstate) { + const Elf_Rela *relalim; + const Elf_Rela *rela; + Elf_Addr *where, target; + const 
Elf_Sym *def; + const Obj_Entry *defobj; + + if (!obj->gnu_ifunc) + return (0); - /* XXX not implemented */ + relalim = (const Elf_Rela *)((const char *)obj->pltrela + obj->pltrelasize); + for (rela = obj->pltrela; rela < relalim; rela++) { + if (ELF_R_TYPE(rela->r_info) == R_RISCV_JUMP_SLOT) { + where = (Elf_Addr *)(obj->relocbase + rela->r_offset); + def = find_symdef(ELF_R_SYM(rela->r_info), obj, &defobj, + SYMLOOK_IN_PLT | flags, NULL, lockstate); + if (def == NULL) + return (-1); + if (ELF_ST_TYPE(def->st_info) != STT_GNU_IFUNC) + continue; + + lock_release(rtld_bind_lock, lockstate); + target = (Elf_Addr)rtld_resolve_ifunc(defobj, def); + wlock_acquire(rtld_bind_lock, lockstate); + reloc_jmpslot(where, target, defobj, obj, + (const Elf_Rel *)rela); + } + } + obj->gnu_ifunc = false; return (0); } @@ -232,7 +306,8 @@ reloc_jmpslot(Elf_Addr *where, Elf_Addr target, const Elf_Rel *rel) { - assert(ELF_R_TYPE(rel->r_info) == R_RISCV_JUMP_SLOT); + assert(ELF_R_TYPE(rel->r_info) == R_RISCV_JUMP_SLOT || + ELF_R_TYPE(rel->r_info) == R_RISCV_IRELATIVE); if (*where != target && !ld_bind_not) *where = target; @@ -251,13 +326,9 @@ reloc_non_plt(Obj_Entry *obj, Obj_Entry *obj_rtld, int flags, const Elf_Rela *rela; const Elf_Sym *def; SymCache *cache; - Elf_Addr *where; + Elf_Addr *where, symval; unsigned long symnum; - if ((flags & SYMLOOK_IFUNC) != 0) - /* XXX not implemented */ - return (0); - /* * The dynamic loader may be called from a thread, we have * limited amounts of stack available so we cannot use alloca(). @@ -285,8 +356,27 @@ reloc_non_plt(Obj_Entry *obj, Obj_Entry *obj_rtld, int flags, if (def == NULL) return (-1); - *where = (Elf_Addr)(defobj->relocbase + def->st_value + - rela->r_addend); + /* + * If symbol is IFUNC, only perform relocation + * when caller allowed it by passing + * SYMLOOK_IFUNC flag. Skip the relocations + * otherwise. 
+ */ + if (ELF_ST_TYPE(def->st_info) == STT_GNU_IFUNC) { + if ((flags & SYMLOOK_IFUNC) == 0) { + obj->non_plt_gnu_ifunc = true; + continue; + } + symval = (Elf_Addr)rtld_resolve_ifunc(defobj, + def); + } else { + if ((flags & SYMLOOK_IFUNC) != 0) + continue; + symval = (Elf_Addr)(defobj->relocbase + + def->st_value); + } + + *where = symval + rela->r_addend; break; case R_RISCV_TLS_DTPMOD64: def = find_symdef(symnum, obj, &defobj, flags, cache, @@ -365,6 +455,9 @@ reloc_non_plt(Obj_Entry *obj, Obj_Entry *obj_rtld, int flags, case R_RISCV_RELATIVE: *where = (Elf_Addr)(obj->relocbase + rela->r_addend); break; + case R_RISCV_IRELATIVE: + obj->irelative_nonplt = true; + break; default: rtld_printf("%s: Unhandled relocation %lu\n", obj->path, ELF_R_TYPE(rela->r_info)); @@ -375,10 +468,13 @@ reloc_non_plt(Obj_Entry *obj, Obj_Entry *obj_rtld, int flags, return (0); } +unsigned long elf_hwcap; + void -ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused) +ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)]) { - + if (aux_info[AT_HWCAP] != NULL) + elf_hwcap = aux_info[AT_HWCAP]->a_un.a_val; } void diff --git a/libexec/rtld-elf/riscv/rtld_machdep.h b/libexec/rtld-elf/riscv/rtld_machdep.h index fb5f5643efc6..c6600b583612 100644 --- a/libexec/rtld-elf/riscv/rtld_machdep.h +++ b/libexec/rtld-elf/riscv/rtld_machdep.h @@ -83,8 +83,11 @@ Elf_Addr reloc_jmpslot(Elf_Addr *where, Elf_Addr target, __asm __volatile("mv gp, %0" :: "r"(old1)); \ }) +extern unsigned long elf_hwcap; #define call_ifunc_resolver(ptr) \ - (((Elf_Addr (*)(void))ptr)()) + (((Elf_Addr (*)(unsigned long, unsigned long, unsigned long, \ + unsigned long, unsigned long, unsigned long, unsigned long, \ + unsigned long))ptr)(elf_hwcap, 0, 0, 0, 0, 0, 0, 0)) /* * TLS From 8afae0caf4c4816eb56b732fcd1a4b185e86098a Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Thu, 22 Aug 2024 20:36:45 +0100 Subject: [PATCH 120/145] riscv: Add machine/ifunc.h corresponding to rtld-elf's resolver interface Reviewed by: 
kib, mhorne MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D46279 --- sys/riscv/include/ifunc.h | 49 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 sys/riscv/include/ifunc.h diff --git a/sys/riscv/include/ifunc.h b/sys/riscv/include/ifunc.h new file mode 100644 index 000000000000..0f9747a2aa14 --- /dev/null +++ b/sys/riscv/include/ifunc.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2015-2018 The FreeBSD Foundation + * Copyright (c) 2024 Jessica Clarke + * + * Part of this software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef __RISCV_IFUNC_H +#define __RISCV_IFUNC_H + +#define DEFINE_IFUNC(qual, ret_type, name, args) \ + static ret_type (*name##_resolver(void))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + static ret_type (*name##_resolver(void))args + +#define DEFINE_UIFUNC(qual, ret_type, name, args) \ + static ret_type (*name##_resolver(unsigned long, unsigned long, \ + unsigned long, unsigned long, unsigned long, unsigned long, \ + unsigned long, unsigned long))args __used; \ + qual ret_type name args __attribute__((ifunc(#name "_resolver"))); \ + static ret_type (*name##_resolver(unsigned long elf_hwcap __unused, \ + unsigned long _arg2 __unused, unsigned long _arg3 __unused, \ + unsigned long _arg4 __unused, unsigned long _arg5 __unused, \ + unsigned long _arg6 __unused, unsigned long _arg7 __unused, \ + unsigned long _arg8 __unused))args + +#endif From e41364711ca3f7e214f9607ebedf62e03e51633d Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Thu, 22 Aug 2024 22:17:05 +0200 Subject: [PATCH 121/145] tcp: improve consistency of SYN-cache handling Originally, a SYN-cache entry was always allocated and later freed, when not needed anymore. Then the allocation was avoided, when no SYN-cache entry was needed, and a copy on the stack was used. But the logic regarding freeing was not updated. This patch doesn't re-check conditions (which may have changed) when deciding to insert or free the entry, but uses the result of the earlier check. This simplifies the code and improves also consistency. Reviewed by: glebius MFC after: 1 week Sponsored by: Netflix, Inc. 
Differential Revision: https://reviews.freebsd.org/D46410 --- sys/netinet/tcp_syncache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 025b071eb1ca..33a6a66b7138 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1720,9 +1720,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * Do a standard 3-way handshake. */ if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { - if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) - syncache_free(sc); - else if (sc != &scs) + if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); From 6aee1dc48006445b6e062db86c5c3697b04b98f6 Mon Sep 17 00:00:00 2001 From: Maxim Konovalov Date: Thu, 22 Aug 2024 21:03:59 +0000 Subject: [PATCH 122/145] pam_xdg.8: the module option name corrected PR: 280994 --- lib/libpam/modules/pam_xdg/pam_xdg.8 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/libpam/modules/pam_xdg/pam_xdg.8 b/lib/libpam/modules/pam_xdg/pam_xdg.8 index 1a8b53def051..9b97d3626531 100644 --- a/lib/libpam/modules/pam_xdg/pam_xdg.8 +++ b/lib/libpam/modules/pam_xdg/pam_xdg.8 @@ -22,7 +22,7 @@ .\" * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" * SUCH DAMAGE. -.Dd February 21, 2024 +.Dd August 22, 2024 .Dt PAM_XDG 8 .Os .Sh NAME @@ -42,8 +42,8 @@ By default the directory is created under .Pa /var/run/xdg/ . 
.Pp The following option may be passed to the authentication module: -.Bl -tag -width ".Cm runtime_dir" -.It Cm runtime_dir Ns = Ns Ar directory +.Bl -tag -width ".Cm runtime_dir_prefix" +.It Cm runtime_dir_prefix Ns = Ns Ar directory Use an alternate base directory .El .Sh SEE ALSO From 101afbc6ee2f06f77e6886f1f3ffe115c579967c Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Thu, 22 Aug 2024 17:28:51 -0600 Subject: [PATCH 123/145] loader/menu: tweak for added line I added a line to the menu, but didn't adjust so things were a line off. Make the necessary adjustments. Fixes: 7cb65be96d47 Sponsored by: Netflix MFC After: 3 days --- stand/lua/drawer.lua | 2 +- stand/lua/menu.lua | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/stand/lua/drawer.lua b/stand/lua/drawer.lua index a009b78164df..3a462930b86b 100644 --- a/stand/lua/drawer.lua +++ b/stand/lua/drawer.lua @@ -472,7 +472,7 @@ logodefs = { brand_position = {x = 2, y = 1} logo_position = {x = 46, y = 4} menu_position = {x = 5, y = 10} -frame_size = {w = 42, h = 13} +frame_size = {w = 42, h = 14} default_shift = {x = 0, y = 0} shift = default_shift diff --git a/stand/lua/menu.lua b/stand/lua/menu.lua index 0587e5ae6586..ed84360d93b0 100644 --- a/stand/lua/menu.lua +++ b/stand/lua/menu.lua @@ -535,7 +535,7 @@ end function menu.autoboot(delay) local x = loader.getenv("loader_menu_timeout_x") or 4 - local y = loader.getenv("loader_menu_timeout_y") or 23 + local y = loader.getenv("loader_menu_timeout_y") or 24 local endtime = loader.time() + delay local time local last From ef3f8aa0a0492487ac7db839de078b1913f61b4c Mon Sep 17 00:00:00 2001 From: Oliver Fromme Date: Thu, 22 Aug 2024 21:46:19 -0700 Subject: [PATCH 124/145] amdsmn(4), amdtemp(4): add support for AMD Ryzen 7 "Phoenix" processors Adds support for AMD Ryzen 7 "Phoenix" processors (family 0x19, model 0x70-0x7f) to the amdsmn(4) and amdtemp(4) drivers. This enables temperature readings of these CPUs via sysctl. 
The sensors function identically to those for the "Raphael" processors (model 0x60-0x6f); only the PCI device ID differs. PR: kern/280942 Relnotes: yes MFC after: 3 days --- sys/dev/amdsmn/amdsmn.c | 7 +++++++ sys/dev/amdtemp/amdtemp.c | 3 +++ 2 files changed, 10 insertions(+) diff --git a/sys/dev/amdsmn/amdsmn.c b/sys/dev/amdsmn/amdsmn.c index cb2ddbd86c2e..9a0428608a27 100644 --- a/sys/dev/amdsmn/amdsmn.c +++ b/sys/dev/amdsmn/amdsmn.c @@ -60,6 +60,7 @@ #define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 #define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 #define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 struct pciid; struct amdsmn_softc { @@ -115,6 +116,12 @@ static const struct pciid { .amdsmn_addr_reg = F17H_SMN_ADDR_REG, .amdsmn_data_reg = F17H_SMN_DATA_REG, }, + { + .amdsmn_vendorid = CPU_VENDOR_AMD, + .amdsmn_deviceid = PCI_DEVICE_ID_AMD_19H_M70H_ROOT, + .amdsmn_addr_reg = F17H_SMN_ADDR_REG, + .amdsmn_data_reg = F17H_SMN_DATA_REG, + }, }; /* diff --git a/sys/dev/amdtemp/amdtemp.c b/sys/dev/amdtemp/amdtemp.c index 9ff7388fd70c..ff9866c6221b 100644 --- a/sys/dev/amdtemp/amdtemp.c +++ b/sys/dev/amdtemp/amdtemp.c @@ -115,6 +115,7 @@ struct amdtemp_softc { #define DEVICEID_AMD_HOSTB17H_M60H_ROOT 0x1630 #define DEVICEID_AMD_HOSTB19H_M10H_ROOT 0x14a4 #define DEVICEID_AMD_HOSTB19H_M60H_ROOT 0x14d8 +#define DEVICEID_AMD_HOSTB19H_M70H_ROOT 0x14e8 static const struct amdtemp_product { uint16_t amdtemp_vendorid; @@ -141,6 +142,7 @@ static const struct amdtemp_product { { VENDORID_AMD, DEVICEID_AMD_HOSTB17H_M60H_ROOT, false }, { VENDORID_AMD, DEVICEID_AMD_HOSTB19H_M10H_ROOT, false }, { VENDORID_AMD, DEVICEID_AMD_HOSTB19H_M60H_ROOT, false }, + { VENDORID_AMD, DEVICEID_AMD_HOSTB19H_M70H_ROOT, false }, }; /* @@ -873,6 +875,7 @@ amdtemp_probe_ccd_sensors19h(device_t dev, uint32_t model) _Static_assert((int)NUM_CCDS >= 12, ""); break; case 0x60 ... 0x6f: /* Zen4 Ryzen "Raphael" */ + case 0x70 ... 
0x7f: /* Zen4 Ryzen "Phoenix" */ sc->sc_temp_base = AMDTEMP_ZEN4_CCD_TMP_BASE; maxreg = 8; _Static_assert((int)NUM_CCDS >= 8, ""); From abdc7bb79635d1d680053bb2bc73128e15cbb14a Mon Sep 17 00:00:00 2001 From: Li-Wen Hsu Date: Fri, 23 Aug 2024 19:57:09 +0800 Subject: [PATCH 125/145] bhyve(8): Remove mention of -A flag, again Follow 107584716cdeaa36c718a42861df1cb4b8b87e62 to remove the mention of -A flag added in b30a7e5418be5c5d27ac967f5ce99461ab79c37c . Sponsored by: The FreeBSD Foundation --- usr.sbin/bhyve/bhyve.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 527ccf720540..8001b5276d51 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1166,7 +1166,7 @@ for a virtual machine, use .Fl o .Ar config.dump=1 : .Bd -literal -offset indent -/usr/sbin/bhyve -c 2 -m 256 -A -H -P \\ +/usr/sbin/bhyve -c 2 -m 256 -H -P \\ -s 0:0,hostbridge -s 1:0,virtio-net,tap0 \\ -s 2:0,ahci-hd,./vm0.img \\ -s 31,lpc -l com1,stdio \\ From bd8486b9fe74775c2579ea758e1bf9563eb3cfba Mon Sep 17 00:00:00 2001 From: "Simon J. Gerraty" Date: Fri, 23 Aug 2024 10:31:16 -0700 Subject: [PATCH 126/145] kern.pre.mk include local.kern.pre.mk Allow for local customization. Reviewed by: stevek Differential Revision: https://reviews.freebsd.org/D46423 --- sys/conf/kern.pre.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index e787d023d9a9..c676418aca93 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -17,6 +17,7 @@ _srcconf_included_: .include .include .include "kern.opts.mk" +.-include # The kernel build always occurs in the object directory which is .CURDIR. 
.if ${.MAKE.MODE:Unormal:Mmeta} From 70174ef7d2c80abdfca0e3ad9d0bb1af61318542 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 23 Aug 2024 18:57:59 +0100 Subject: [PATCH 127/145] proccontrol: make -s require a target Require a command to exec or a pid to target and update usage and the manpage to make this more clear. It makes no sense to invoke a procctl(2) command on the current process only to exit. Users are sometimes confused about how proccontrol works and think it affects their shell environment when invoked without a target. Disallowing this nonsensical behavior and clarifying usage will hopefully reduce confusion. Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D46422 --- usr.bin/proccontrol/proccontrol.1 | 7 +++++-- usr.bin/proccontrol/proccontrol.c | 14 ++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/usr.bin/proccontrol/proccontrol.1 b/usr.bin/proccontrol/proccontrol.1 index 7ab917e4a61f..ee71c3200b1a 100644 --- a/usr.bin/proccontrol/proccontrol.1 +++ b/usr.bin/proccontrol/proccontrol.1 @@ -34,9 +34,12 @@ .Sh SYNOPSIS .Nm .Fl m Ar mode -.Op Fl s Ar control -.Op Fl q +.Fl s Ar control .Fl p Ar pid | command +.Nm +.Fl m Ar mode +.Fl q +.Op Fl p Ar pid | command .Sh DESCRIPTION The .Nm diff --git a/usr.bin/proccontrol/proccontrol.c b/usr.bin/proccontrol/proccontrol.c index 4b7543d63793..be78e14fd75e 100644 --- a/usr.bin/proccontrol/proccontrol.c +++ b/usr.bin/proccontrol/proccontrol.c @@ -81,10 +81,14 @@ str2pid(const char *str) static void __dead2 usage(void) { - - fprintf(stderr, "Usage: proccontrol -m (aslr|protmax|trace|trapcap|" - "stackgap|nonewprivs|wxmap"KPTI_USAGE LA_USAGE") [-q] " - "[-s (enable|disable)] [-p pid | command]\n"); + fprintf(stderr, "Usage:\n"); + fprintf(stderr, " proccontrol -m mode -s (enable|disable) " + "(-p pid | command)\n"); + fprintf(stderr, " proccontrol -m mode -q [-p pid]\n"); + fprintf(stderr, "Modes: " + "aslr|protmax|trace|trapcap|stackgap|nonewprivs|wxmap" + 
KPTI_USAGE LA_USAGE + "\n"); exit(1); } @@ -157,6 +161,8 @@ main(int argc, char *argv[]) usage(); pid = getpid(); } else if (pid == -1) { + if (!query) + usage(); pid = getpid(); } From 5cbb98c8259c48ba22c8359f4c14f5438329ce58 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 23 Aug 2024 18:59:46 +0100 Subject: [PATCH 128/145] proccontrol.1: bump doc date Fixes: 70174ef7d2c8 proccontrol: make -s require a target --- usr.bin/proccontrol/proccontrol.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr.bin/proccontrol/proccontrol.1 b/usr.bin/proccontrol/proccontrol.1 index ee71c3200b1a..7c4330fb1166 100644 --- a/usr.bin/proccontrol/proccontrol.1 +++ b/usr.bin/proccontrol/proccontrol.1 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd October 5, 2023 +.Dd August 23, 2024 .Dt PROCCONTROL 1 .Os .Sh NAME From e972e408d19a58c4f2855eface487d06ef73b799 Mon Sep 17 00:00:00 2001 From: Gordon Tetlow Date: Fri, 23 Aug 2024 22:35:07 -0700 Subject: [PATCH 129/145] release: Redirect etcupdate logfile to /dev/null. Stop shipping a log file for etcupdate. This is a source of non-reproducibility as it uses mktemp thereby guaranteeing the output is different each run. 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D46317 --- release/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/Makefile b/release/Makefile index a03951d17499..6f59647ff5db 100644 --- a/release/Makefile +++ b/release/Makefile @@ -123,7 +123,7 @@ base.txz: sh ${WORLDDIR}/usr.sbin/etcupdate/etcupdate.sh extract -B \ -m "${MAKE}" -M "TARGET_ARCH=${TARGET_ARCH} TARGET=${TARGET}" \ -s ${WORLDDIR} -d "${.OBJDIR}/${DISTDIR}/base/var/db/etcupdate" \ - ${NO_ROOT:D-N} + -L /dev/null ${NO_ROOT:D-N} .if defined(NO_ROOT) echo "./var/db/etcupdate type=dir uname=root gname=wheel mode=0755" >> ${.OBJDIR}/${DISTDIR}/base.meta sed -n 's,^\.,./var/db/etcupdate/current,p' ${.OBJDIR}/${DISTDIR}/base/var/db/etcupdate/current/METALOG \ From ca2c1968611123ca7e2369bddd47b5c43ffaecea Mon Sep 17 00:00:00 2001 From: Jose Luis Duran Date: Thu, 22 Aug 2024 03:23:59 +0000 Subject: [PATCH 130/145] mk: Add a BTI-report linker feature Add support for specifying how to report the missing Branch Target Identification (BTI) linker feature on AArch64. 
For: Kernel: bti-report on when the linker supports it Userspace: bti-report on when the linker supports it and BTI_REPORT_ERROR is defined Fixes: 43e8849bc294 ("conf: Enable BTI checking in the arm64 kernel") Pull Request: https://github.com/freebsd/freebsd-src/pull/1393 (cherry picked from commit 973bbdab47035ebd16200c63d095904924dc44d9) --- share/mk/bsd.lib.mk | 2 +- share/mk/bsd.linker.mk | 11 ++++++++--- share/mk/bsd.prog.mk | 2 +- sys/conf/kern.mk | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/share/mk/bsd.lib.mk b/share/mk/bsd.lib.mk index a1927181de14..18b6ad8b04e1 100644 --- a/share/mk/bsd.lib.mk +++ b/share/mk/bsd.lib.mk @@ -107,7 +107,7 @@ LDFLAGS+= -Wl,-zretpolineplt LDFLAGS.bfd+= -Wl,-znoexecstack .if ${MK_BRANCH_PROTECTION} != "no" CFLAGS+= -mbranch-protection=standard -.if ${MACHINE_ARCH} == "aarch64" && defined(BTI_REPORT_ERROR) +.if ${LINKER_FEATURES:Mbti-report} && defined(BTI_REPORT_ERROR) LDFLAGS+= -Wl,-zbti-report=error .endif .endif diff --git a/share/mk/bsd.linker.mk b/share/mk/bsd.linker.mk index 344de2aa6a40..f27755976761 100644 --- a/share/mk/bsd.linker.mk +++ b/share/mk/bsd.linker.mk @@ -11,9 +11,11 @@ # LINKER_FEATURES may contain one or more of the following, based on # linker support for that feature: # -# - build-id: support for generating a Build-ID note -# - retpoline: support for generating PLT with retpoline speculative -# execution vulnerability mitigation +# - build-id: support for generating a Build-ID note +# - retpoline: support for generating PLT with retpoline speculative +# execution vulnerability mitigation +# - bti-report: support for specifying how to report the missing +# Branch Target Identification (BTI) property (AArch64) # # LINKER_FREEBSD_VERSION is the linker's internal source version. 
# @@ -140,6 +142,9 @@ ${X_}LINKER_FEATURES+= ifunc-noplt # If we are using lld 10.0 or newer we can use -Wl,--gdb-index without crashing ${X_}LINKER_FEATURES+= gdb-index .endif +.if ${${X_}LINKER_TYPE} == "lld" && ${${X_}LINKER_VERSION} >= 140000 +${X_}LINKER_FEATURES+= bti-report +.endif .endif .else # Use LD's values diff --git a/share/mk/bsd.prog.mk b/share/mk/bsd.prog.mk index 237794ccf3d2..028e4bbfe882 100644 --- a/share/mk/bsd.prog.mk +++ b/share/mk/bsd.prog.mk @@ -79,7 +79,7 @@ LDFLAGS+= -Wl,-zretpolineplt LDFLAGS.bfd+= -Wl,-znoexecstack .if ${MK_BRANCH_PROTECTION} != "no" CFLAGS+= -mbranch-protection=standard -.if ${MACHINE_ARCH} == "aarch64" && defined(BTI_REPORT_ERROR) +.if ${LINKER_FEATURES:Mbti-report} && defined(BTI_REPORT_ERROR) LDFLAGS+= -Wl,-zbti-report=error .endif .endif diff --git a/sys/conf/kern.mk b/sys/conf/kern.mk index 80c7c4f8ce76..e1a276d81a63 100644 --- a/sys/conf/kern.mk +++ b/sys/conf/kern.mk @@ -143,7 +143,7 @@ CFLAGS += -mgeneral-regs-only CFLAGS += -ffixed-x18 # Build with BTI+PAC CFLAGS += -mbranch-protection=standard -.if ${LINKER_TYPE} == "lld" +.if ${LINKER_FEATURES:Mbti-report} LDFLAGS += -Wl,-zbti-report=error .endif # TODO: support outline atomics From 1155f27699177206a99ee21145ae3599cef54db8 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 21 Nov 2024 13:32:15 -0500 Subject: [PATCH 131/145] md: Fix linking of embedded filesystem images on aarch64 embedfs.S needs the right aarch64 features for BTI and/or PAC. Fixes: c2e0d56f5e49 ("arm64: Support BTI checking in most of the kernel") --- sys/dev/md/embedfs.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sys/dev/md/embedfs.S b/sys/dev/md/embedfs.S index 33f37cd04ae1..033c73391938 100644 --- a/sys/dev/md/embedfs.S +++ b/sys/dev/md/embedfs.S @@ -42,3 +42,9 @@ mfs_root: .type mfs_root_end, %object mfs_root_end: .size mfs_root_end, . 
- mfs_root_end + +#if defined(__aarch64__) +#include +#include +GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL) +#endif From 1d5719cf645ef64804da5909b817904003c93f8d Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Mon, 2 Sep 2024 16:36:13 +0100 Subject: [PATCH 132/145] arm64: Trap floating-point registers with VHE When VHE is enabled we disabled trapping floating-point instructions to EL2 in locore.S. As the kernel is running in EL2 then vfp.c will manage enabling floating-point instructions as needed. Sponsored by: Arm Ltd (cherry picked from commit 769eeb09325419ccbc7fbd1c22afc9b1e1e191f3) --- sys/arm64/arm64/locore.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index f9f0e3f8af53..d8b1ae15967a 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -563,7 +563,7 @@ LENTRY(enter_kernel_el) isb msr SCTLR_EL12_REG, x2 - ldr x2, =(CPTR_FPEN) + mov x2, xzr /* CPTR_EL2 is managed by vfp.c */ ldr x3, =(CNTHCTL_E2H_EL1PCTEN | CNTHCTL_E2H_EL1PTEN) ldr x5, =(PSR_DAIF | PSR_M_EL2h) b .Ldone_vhe From 56f86efdb32b328090771f71f5ef07ccf3b74c9c Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 11 Sep 2024 10:38:08 +0100 Subject: [PATCH 133/145] arm64: Adjust the indentation of CPTR_EL2 values Reviewed by: emaste Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46513 (cherry picked from commit 7a488d83b3af4d59946319b251a3a2060f18df40) --- sys/arm64/include/hypervisor.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index da723130c4ef..c85809deadbe 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -53,19 +53,19 @@ /* CPTR_EL2 - Architecture feature trap register */ /* Valid if HCR_EL2.E2H == 0 */ -#define CPTR_RES0 0x7fefc800 +#define CPTR_RES0 0x7fefc800 #if __has_feature(capabilities) -#define 
CPTR_RES1 0x000031ff -#define CPTR_TC 0x00000200 /* Trap Capabilities */ +#define CPTR_RES1 0x000031ff +#define CPTR_TC 0x00000200 /* Trap Capabilities */ #else -#define CPTR_RES1 0x000033ff +#define CPTR_RES1 0x000033ff #endif -#define CPTR_TFP 0x00000400 +#define CPTR_TFP 0x00000400 /* Valid if HCR_EL2.E2H == 1 */ -#define CPTR_FPEN 0x00300000 +#define CPTR_FPEN 0x00300000 /* Unconditionally valid */ -#define CPTR_TTA 0x00100000 -#define CPTR_TCPAC 0x80000000 +#define CPTR_TTA 0x00100000 +#define CPTR_TCPAC 0x80000000 /* HCR_EL2 - Hypervisor Config Register */ #define HCR_VM (UL(0x1) << 0) From fc37b7074729d7bae3357200d2246c69ef8d14e8 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 11 Sep 2024 10:38:15 +0100 Subject: [PATCH 134/145] arm64: Add E2H to CPTR_EL2 register values Rename register fields that are only valid when HCR_EL2.E2H == 1. Some fields move around depending on the value of the E2H field. Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46514 (cherry picked from commit 16e661921a9bbc6aab455c59da055b6f4ff75627) --- sys/arm64/include/hypervisor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index c85809deadbe..00f2e089b20c 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -62,7 +62,7 @@ #endif #define CPTR_TFP 0x00000400 /* Valid if HCR_EL2.E2H == 1 */ -#define CPTR_FPEN 0x00300000 +#define CPTR_E2H_FPEN 0x00300000 /* Unconditionally valid */ #define CPTR_TTA 0x00100000 #define CPTR_TCPAC 0x80000000 From 924f1e9d32959558fc99571213395ad2b9285b4f Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 11 Sep 2024 10:38:22 +0100 Subject: [PATCH 135/145] arm64: Add CPTR_E2H_TTA The TTA field moves depending on the HCR_EL2.E2H field. Add a macro to hold the E2H == 1 case. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46515 (cherry picked from commit 9f3d15fda29a9d510754daed8e6158c637108b42) --- sys/arm64/include/hypervisor.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index 00f2e089b20c..16480b560617 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -61,10 +61,11 @@ #define CPTR_RES1 0x000033ff #endif #define CPTR_TFP 0x00000400 +#define CPTR_TTA 0x00100000 /* Valid if HCR_EL2.E2H == 1 */ #define CPTR_E2H_FPEN 0x00300000 +#define CPTR_E2H_TTA 0x10000000 /* Unconditionally valid */ -#define CPTR_TTA 0x00100000 #define CPTR_TCPAC 0x80000000 /* HCR_EL2 - Hypervisor Config Register */ From 3c62b50ba272d272a1370634c8fda6f6a774e0b1 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Wed, 11 Sep 2024 10:38:27 +0100 Subject: [PATCH 136/145] arm64: Add CPTR_TRAP_ALL and use it in vmm Add a new macro that enables all CPTR_EL2 traps. This helps ensure we trap all extensions we don't support. 
Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D46516 (cherry picked from commit d54631360276d3fdbaa9a7872f8af82f1f4287da) --- sys/arm64/include/hypervisor.h | 2 ++ sys/arm64/vmm/vmm_reset.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index 16480b560617..886534e3ab25 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -53,6 +53,7 @@ /* CPTR_EL2 - Architecture feature trap register */ /* Valid if HCR_EL2.E2H == 0 */ +#define CPTR_TRAP_ALL 0xc01037ff /* Enable all traps */ #define CPTR_RES0 0x7fefc800 #if __has_feature(capabilities) #define CPTR_RES1 0x000031ff @@ -63,6 +64,7 @@ #define CPTR_TFP 0x00000400 #define CPTR_TTA 0x00100000 /* Valid if HCR_EL2.E2H == 1 */ +#define CPTR_E2H_TRAP_ALL 0xd0000000 #define CPTR_E2H_FPEN 0x00300000 #define CPTR_E2H_TTA 0x10000000 /* Unconditionally valid */ diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c index 7042bc2695be..a55c50b2f3dd 100644 --- a/sys/arm64/vmm/vmm_reset.c +++ b/sys/arm64/vmm/vmm_reset.c @@ -178,9 +178,10 @@ reset_vm_el2_regs(void *vcpu) * and floating point functionality to EL2. */ if (in_vhe()) - el2ctx->cptr_el2 = CPACR_FPEN_TRAP_NONE; + el2ctx->cptr_el2 = CPTR_E2H_TRAP_ALL | CPTR_E2H_FPEN; else - el2ctx->cptr_el2 = CPTR_RES1; + el2ctx->cptr_el2 = CPTR_TRAP_ALL & ~CPTR_TFP; + el2ctx->cptr_el2 &= ~CPTR_TCPAC; /* * Disable interrupts in the guest. The guest OS will re-enable * them. From 461a2fb4a2af3a8ed60ca6cf77463e70ca6c1d77 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Fri, 27 Sep 2024 14:41:08 +0100 Subject: [PATCH 137/145] arm64: Don't trap SVE to EL2 As with floating point instructions don't trap SVE instructions to the hypervisor. This lets us handle them in the kernel.
Reviewed by: imp (earlier version) Sponsored by: Arm Ltd Differential Revision: https://reviews.freebsd.org/D43303 (cherry picked from commit fe5ed2496e44aec018a6215175bba225b20d81fd) --- sys/arm64/include/hypervisor.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index 886534e3ab25..2a07a23f94ac 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -56,15 +56,16 @@ #define CPTR_TRAP_ALL 0xc01037ff /* Enable all traps */ #define CPTR_RES0 0x7fefc800 #if __has_feature(capabilities) -#define CPTR_RES1 0x000031ff +#define CPTR_RES1 0x000030ff #define CPTR_TC 0x00000200 /* Trap Capabilities */ #else -#define CPTR_RES1 0x000033ff +#define CPTR_RES1 0x000032ff #endif #define CPTR_TFP 0x00000400 #define CPTR_TTA 0x00100000 /* Valid if HCR_EL2.E2H == 1 */ #define CPTR_E2H_TRAP_ALL 0xd0000000 +#define CPTR_E2H_ZPEN 0x00030000 #define CPTR_E2H_FPEN 0x00300000 #define CPTR_E2H_TTA 0x10000000 /* Unconditionally valid */ From 424a25a7d99c89a97845fd0a8639f09700f5ebf3 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 22 Nov 2024 13:05:30 -0500 Subject: [PATCH 138/145] Morello: Add constant for CEN field in CPTR_EL2 when VHE is enabled --- sys/arm64/include/hypervisor.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h index 2a07a23f94ac..cd00f1198708 100644 --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -66,6 +66,9 @@ /* Valid if HCR_EL2.E2H == 1 */ #define CPTR_E2H_TRAP_ALL 0xd0000000 #define CPTR_E2H_ZPEN 0x00030000 +#if __has_feature(capabilities) +#define CPTR_E2H_CEN 0x000c0000 +#endif #define CPTR_E2H_FPEN 0x00300000 #define CPTR_E2H_TTA 0x10000000 /* Unconditionally valid */ From 808fb2a504f352069ce49eebb8e0698cc97d1f10 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 22 Nov 2024 13:06:05 -0500 Subject: [PATCH 139/145] Morello: Don't trap Morello 
instructions in EL2 for the kernel under VHE --- sys/arm64/arm64/locore.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index d8b1ae15967a..ee015c8bfe75 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -563,7 +563,11 @@ LENTRY(enter_kernel_el) isb msr SCTLR_EL12_REG, x2 +#if __has_feature(capabilities) + ldr x2, =(CPTR_E2H_CEN) +#else mov x2, xzr /* CPTR_EL2 is managed by vfp.c */ +#endif ldr x3, =(CNTHCTL_E2H_EL1PCTEN | CNTHCTL_E2H_EL1PTEN) ldr x5, =(PSR_DAIF | PSR_M_EL2h) b .Ldone_vhe From f2867b873653dd88d2882f122a28ccace0cb8e7c Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 22 Nov 2024 13:06:48 -0500 Subject: [PATCH 140/145] vmm: Don't trap access to Morello instructions for guests --- sys/arm64/vmm/vmm_reset.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c index a55c50b2f3dd..bb8867e7fdef 100644 --- a/sys/arm64/vmm/vmm_reset.c +++ b/sys/arm64/vmm/vmm_reset.c @@ -181,6 +181,13 @@ reset_vm_el2_regs(void *vcpu) el2ctx->cptr_el2 = CPTR_E2H_TRAP_ALL | CPTR_E2H_FPEN; else el2ctx->cptr_el2 = CPTR_TRAP_ALL & ~CPTR_TFP; +#if __has_feature(capabilities) + /* Don't trap accesses to capability registers. */ + if (in_vhe()) + el2ctx->cptr_el2 |= CPTR_E2H_CEN; + else + el2ctx->cptr_el2 &= ~CPTR_TC; +#endif el2ctx->cptr_el2 &= ~CPTR_TCPAC; /* * Disable interrupts in the guest. The guest OS will re-enable From d291e9d3c725b540380fd2320d3a97d6677fc9ab Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 22 Nov 2024 16:27:44 -0500 Subject: [PATCH 141/145] locore: Set CPTR_EL2 before using any Morello instructions This was a regression in the merge of the VHE changes which moved the setting of VBAR_EL2 before CPTR_EL2. 
--- sys/arm64/arm64/locore.S | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index ee015c8bfe75..2304c5ac62ed 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -568,11 +568,29 @@ LENTRY(enter_kernel_el) #else mov x2, xzr /* CPTR_EL2 is managed by vfp.c */ #endif + msr cptr_el2, x2 +#if __has_feature(capabilities) + /* + * Wait for the write to cptr_el2 to complete. It will enable the + * use of capabilities at EL2 that we need below. When not using + * capabilities this is unneeded as the eret instruction will + * act as in place of this barrier. + */ + isb +#endif + ldr x3, =(CNTHCTL_E2H_EL1PCTEN | CNTHCTL_E2H_EL1PTEN) ldr x5, =(PSR_DAIF | PSR_M_EL2h) b .Ldone_vhe .Lno_vhe: + ldr x2, =(CPTR_RES1) + msr cptr_el2, x2 +#if __has_feature(capabilities) + /* As noted above, wait for the write to cptr_el2 to complete. */ + isb +#endif + /* Hypervisor trap functions */ adrp x2, hyp_stub_vectors add x2, x2, :lo12:hyp_stub_vectors @@ -583,23 +601,9 @@ LENTRY(enter_kernel_el) msr vbar_el2, x2 #endif - ldr x2, =(CPTR_RES1) ldr x3, =(CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) ldr x5, =(PSR_DAIF | PSR_M_EL1h) - .Ldone_vhe: - - msr cptr_el2, x2 -#if __has_feature(capabilities) - /* - * Wait for the write to cptr_el2 to complete. It will enable the - * use of capabilities at EL2 that we need below. When not using - * capabilities this is unneeded as the eret instruction will - * act as in place of this barrier. - */ - isb -#endif - /* Enable access to the physical timers at EL1 */ msr cnthctl_el2, x3 /* Set the return PSTATE */ From 9d00b3213dd4c7a96612497544f3a61ac3dd66ea Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 22 Nov 2024 16:32:55 -0500 Subject: [PATCH 142/145] locore: Don't clear DDC too early in the VHE case With VHE, the kernel runs in EL2, so defer clearing DDC until locore has finished using DDC. 
In the case of VHE, clear DDC_EL1 to be on the safe side. --- sys/arm64/arm64/locore.S | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S index 2304c5ac62ed..feb6c7963b58 100644 --- a/sys/arm64/arm64/locore.S +++ b/sys/arm64/arm64/locore.S @@ -577,6 +577,9 @@ LENTRY(enter_kernel_el) * act as in place of this barrier. */ isb + + /* Clear DDC_EL1 */ + msr ddc_el1, czr #endif ldr x3, =(CNTHCTL_E2H_EL1PCTEN | CNTHCTL_E2H_EL1PTEN) @@ -603,6 +606,11 @@ LENTRY(enter_kernel_el) ldr x3, =(CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) ldr x5, =(PSR_DAIF | PSR_M_EL1h) + +#ifdef __CHERI_PURE_CAPABILITY__ + /* Clear DDC_EL2 */ + msr ddc, czr +#endif .Ldone_vhe: /* Enable access to the physical timers at EL1 */ msr cnthctl_el2, x3 @@ -636,9 +644,6 @@ LENTRY(enter_kernel_el) mrs x2, cctlr_el2 orr x2, x2, #(CCTLR_EL2_C64E_MASK) msr cctlr_el2, x2 - - /* Clear DDC_EL2 */ - msr ddc, czr #endif /* Set the address to return to our return address */ #if __has_feature(capabilities) From 2924ab0a6761c4b6391c880db2ffc1855438e262 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 22 Nov 2024 17:28:45 -0500 Subject: [PATCH 143/145] vmm: Ensure guests start with a cleared CCTLR_EL1 --- sys/arm64/vmm/vmm_reset.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c index bb8867e7fdef..9946e2baae95 100644 --- a/sys/arm64/vmm/vmm_reset.c +++ b/sys/arm64/vmm/vmm_reset.c @@ -70,6 +70,9 @@ reset_vm_el01_regs(void *vcpu) set_arch_unknown(el2ctx->mdccint_el1); set_arch_unknown(el2ctx->mdscr_el1); set_arch_unknown(el2ctx->par_el1); +#if __has_feature(capabilities) + set_arch_unknown(el2ctx->cctlr_el1); +#endif /* * Guest starts with: From 1e64dbfc0e01488ed64aa43bdb28b891d64bc5be Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 22 Nov 2024 17:29:16 -0500 Subject: [PATCH 144/145] vmm: CCTLR_EL1 uses an alternate register (CCTLR_EL12) under VHE --- sys/arm64/include/armreg.h 
| 14 ++++++++++++++ sys/arm64/vmm/vmm_hyp.c | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h index 64587220507b..925c67a4f752 100644 --- a/sys/arm64/include/armreg.h +++ b/sys/arm64/include/armreg.h @@ -394,6 +394,20 @@ * CCTLR_EL1/2 - Capability Control Register * The rest of the fields mirror CCTLR_EL0 */ +#define CCTLR_EL1_REG MRS_REG_ALT_NAME(CCTLR_EL1) +#define CCTLR_EL1_op0 3 +#define CCTLR_EL1_op1 0 +#define CCTLR_EL1_CRn 1 +#define CCTLR_EL1_CRm 2 +#define CCTLR_EL1_op2 2 + +#define CCTLR_EL12_REG MRS_REG_ALT_NAME(CCTLR_EL12) +#define CCTLR_EL12_op0 3 +#define CCTLR_EL12_op1 5 +#define CCTLR_EL12_CRn 1 +#define CCTLR_EL12_CRm 2 +#define CCTLR_EL12_op2 2 + #define CCTLR_EL1_C64E_MASK (0x1 << 5) /* Enable C64 mode upon exception */ #define CCTLR_EL1_TGEN1_MASK (0x1 << 1) /* Page table CLG bit for TTBR1 */ #define CCTLR_EL1_TGEN0_MASK (0x1 << 0) /* Page table CLG bit for TTBR0 */ diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index 376f7c2d240b..f9c7cb0bbd7f 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -233,7 +233,6 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) /* Store the guest special registers */ #if __has_feature(capabilities) hypctx->cctlr_el0 = READ_SPECIALREG(cctlr_el0); - hypctx->cctlr_el1 = READ_SPECIALREG(cctlr_el1); hypctx->cid_el0 = READ_SPECIALREG_CAP(cid_el0); hypctx->ddc_el0 = READ_SPECIALREG_CAP(ddc_el0); hypctx->rcsp_el0 = READ_SPECIALREG_CAP(rcsp_el0); @@ -259,6 +258,7 @@ vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) if (guest_or_nonvhe(guest)) { #if __has_feature(capabilities) hypctx->elr_el1 = READ_SPECIALREG_CAP(EL1_REG(ELR)); + hypctx->cctlr_el1 = READ_SPECIALREG(EL1_REG(CCTLR)); hypctx->vbar_el1 = READ_SPECIALREG_CAP(EL1_REG(VBAR)); #else hypctx->elr_el1 = READ_SPECIALREG(EL1_REG(ELR)); @@ -299,7 +299,6 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, 
bool guest) #if __has_feature(capabilities) WRITE_SPECIALREG(cctlr_el0, hypctx->cctlr_el0); - WRITE_SPECIALREG(cctlr_el1, hypctx->cctlr_el1); WRITE_SPECIALREG_CAP(cid_el0, hypctx->cid_el0); WRITE_SPECIALREG_CAP(ddc_el0, hypctx->ddc_el0); WRITE_SPECIALREG_CAP(rcsp_el0, hypctx->rcsp_el0); @@ -325,6 +324,7 @@ vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) if (guest_or_nonvhe(guest)) { #if __has_feature(capabilities) WRITE_SPECIALREG_CAP(EL1_REG(ELR), hypctx->elr_el1); + WRITE_SPECIALREG(EL1_REG(CCTLR), hypctx->cctlr_el1); WRITE_SPECIALREG_CAP(EL1_REG(VBAR), hypctx->vbar_el1); #else WRITE_SPECIALREG(EL1_REG(ELR), hypctx->elr_el1); From ad5227ed16ac6c70ef319a0fb2e2cfe41f81716c Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 3 Dec 2024 09:59:31 -0500 Subject: [PATCH 145/145] Merged through August 23, 2024 --- .last_merge | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.last_merge b/.last_merge index e5b78812092f..67bcf88fc09a 100644 --- a/.last_merge +++ b/.last_merge @@ -1 +1 @@ -freebsd-main-20240816 +freebsd-main-20240823